In [1]:
# importing libraries
import pandas as pd
pd.set_option('display.max_colwidth', 500)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import _pickle as pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import calinski_harabasz_score, silhouette_score, davies_bouldin_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.pipeline import Pipeline
from tqdm import tqdm_notebook as tqdm

In [2]:
# Loading in the cleaned DF
# NOTE: unpickling can execute arbitrary code, so "df.pkl" must come from a trusted source
with open("df.pkl", "rb") as pkl_file:
    raw_df = pickle.load(pkl_file)

# Previewing the first rows of the DF
raw_df.head()

Unnamed: 0,ID,Bios,Movie,TV,Music,Book,Sport,Vacation,People,Branch
0,19BCE0370,Fun. Talkative.,9,5,4,3,6,7,2,5
1,19BCE0148,Procrastinator. Sporty. Intuitive.,0,8,2,9,6,2,0,5
2,20BCE2014,Bored. Procrastinator. Ambitious. HardWorking. Emotinal. Sensitive.,5,2,2,8,9,1,8,5
3,20BEC0414,Nothing.,5,2,0,1,3,9,2,6
4,20BCE0684,Bold. Hardworking. Funny.,0,0,0,5,7,1,2,5


In [3]:
# Loading in the clustered DF (same profiles plus a 'Cluster #' label column)
# NOTE: unpickling can execute arbitrary code, so "clustered_profiles.pkl" must come from a trusted source
with open("clustered_profiles.pkl", "rb") as pkl_file:
    cluster_df = pickle.load(pkl_file)

# Previewing the last rows of the DF
cluster_df.tail()

Unnamed: 0,ID,Bios,Movie,TV,Music,Book,Sport,Vacation,People,Branch,Cluster #
96,20BEC0298,Confident. Geek. Dreamer.,2,3,5,7,0,3,8,6,5
97,20BCE2663,Extrovert.,0,9,4,7,1,2,8,5,7
98,20BCI0088,Learner.,0,2,0,9,6,7,2,5,8
99,20BCE2006,No.,0,2,0,5,0,9,6,5,8
100,20BCT0154,Programmer. Coding. Bodybuilder.,1,0,0,5,6,7,3,5,8


## Creating new profile data

In [5]:
# Printing a user interface for inputting new values
print("Enter new profile information...\n\nExample Bio:\nBacon enthusiast. Falls down a lot. Freelance social media fan. Infuriatingly humble introvert.")

# Asking for new profile data (surrounding whitespace is not part of the answer)
new_id = input("Enter your registration number: ").strip()
new_bio = input("Enter a Bio for yourself: ").strip()

# Every column other than the two free-text fields gets a random placeholder value
rating_cols = [col for col in raw_df.columns if col not in ('ID', 'Bios')]
random_ratings = np.random.randint(0, 10, len(rating_cols))

# Collecting the whole row first so the DF is built in one step
new_row = {'ID': new_id, 'Bios': new_bio}
new_row.update(zip(rating_cols, random_ratings))

# Building the one-row DF with the same column order as raw_df,
# indexed right after the last existing profile
new_profile = pd.DataFrame([new_row], columns=raw_df.columns, index=[raw_df.index[-1] + 1])

Enter new profile information...

Example Bio:
Bacon enthusiast. Falls down a lot. Freelance social media fan. Infuriatingly humble introvert.


In [6]:
# Viewing the one-row DF that holds the new profile
new_profile

Unnamed: 0,ID,Bios,Movie,TV,Music,Book,Sport,Vacation,People,Branch
101,19BCE3333,Fun. Loving. Caring.,5,0,9,7,3,2,2,6


## Classification of the new profile

In [7]:
# Importing 3 models
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

In [8]:
# Vectorizing the data
# Assigning the split variables: features (X) and cluster labels (y)
# `axis` is passed by keyword because the positional form is deprecated in pandas
X = cluster_df.drop(["Cluster #"], axis=1)
y = cluster_df['Cluster #']

## Vectorizing
# Instantiating the Vectorizer
vectorizer = CountVectorizer()

# Fitting the vectorizer to the Bios
x = vectorizer.fit_transform(X['Bios'])

# Newer scikit-learn exposes get_feature_names_out(); older releases only have get_feature_names()
if hasattr(vectorizer, "get_feature_names_out"):
    bio_words = vectorizer.get_feature_names_out()
else:
    bio_words = vectorizer.get_feature_names()

# Creating a new DF that contains the vectorized words.
# It shares X's index so the concat below lines rows up even if the index is not 0..n-1
df_wrds = pd.DataFrame(x.toarray(), index=X.index, columns=bio_words)

# Concating the words DF with the original DF
X = pd.concat([X, df_wrds], axis=1)

# Dropping the text columns: Bios is replaced by its word counts and ID is not a feature
X = X.drop(['Bios', 'ID'], axis=1)


  X = cluster_df.drop(["Cluster #"], 1)


In [9]:
# Scaling the Data
# The scaler is kept in its own variable because it is reused later on the new profile
scaler = MinMaxScaler()

# Fit on the feature matrix and rescale it, then restore the row/column labels
scaled_values = scaler.fit_transform(X)
X = pd.DataFrame(scaled_values, index=X.index, columns=X.columns)

In [10]:
# Vectorizing the new data with the vocabulary the vectorizer has already been fitted on
vect_new_prof = vectorizer.transform(new_profile['Bios'])

# Newer scikit-learn exposes get_feature_names_out(); older releases only have get_feature_names()
if hasattr(vectorizer, "get_feature_names_out"):
    bio_words = vectorizer.get_feature_names_out()
else:
    bio_words = vectorizer.get_feature_names()

# Quick DF of the vectorized words, indexed like the new profile so the concat aligns
new_vect_w = pd.DataFrame(vect_new_prof.toarray(), columns=bio_words, index=new_profile.index)

# Concatenating the DFs for the new profile data and dropping the text columns
# (`axis`/`columns` are passed by keyword; the positional forms are deprecated in pandas)
new_vect_prof = pd.concat([new_profile, new_vect_w], axis=1).drop(columns=['Bios', 'ID'])

# Scaling the new profile data with the already-fitted scaler (transform only, no refit)
new_vect_prof = pd.DataFrame(scaler.transform(new_vect_prof), columns=new_vect_prof.columns, index=new_vect_prof.index)

  new_vect_prof = pd.concat([new_profile, new_vect_w], 1).drop('Bios', 1).drop('ID',1)
  new_vect_prof = pd.concat([new_profile, new_vect_w], 1).drop('Bios', 1).drop('ID',1)
  new_vect_prof = pd.concat([new_profile, new_vect_w], 1).drop('Bios', 1).drop('ID',1)


In [11]:
# Viewing the vectorized and scaled feature row of the new profile
new_vect_prof

Unnamed: 0,Movie,TV,Music,Book,Sport,Vacation,People,Branch,adaptable,adventures,...,thinker,thoughtful,traveler,traveling,truthful,tv,understanding,uninteresting,visiting,vivacious
101,0.555556,0.0,1.0,0.777778,0.333333,0.222222,0.222222,0.857143,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Train, test, split
# A fixed random_state makes the split (and anything scored on it) repeatable across runs
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

### Finding the Best Model
- Dummy (Baseline Model)
- KNN
- SVM

### SVM is the best model, so it is refit on all the data below

In [25]:
# Training an SVC with default settings on every clustered profile (fit returns the fitted model)
svm = SVC().fit(X, y)

# Asking the fitted model which cluster the new profile falls into
designated_cluster = svm.predict(new_vect_prof)

designated_cluster

array([12])

In [27]:
# Keeping only the profiles whose cluster label equals the predicted one
in_predicted_cluster = cluster_df['Cluster #'].eq(designated_cluster[0])
des_cluster = cluster_df.loc[in_predicted_cluster]

des_cluster

Unnamed: 0,ID,Bios,Movie,TV,Music,Book,Sport,Vacation,People,Branch,Cluster #
12,19BCE2216,vivacious. personable. stubborn.,9,4,0,3,0,1,8,5,12
14,19BCE0176,laidback.,8,2,2,8,0,4,6,5,12
15,21VCA01999,good.,8,4,0,8,2,9,6,1,12
17,21BBA0217,Ambivert.,5,0,4,9,4,7,2,7,12
29,19BCE0350,Lark. Sports enthusiast. Artist.,8,2,6,8,0,0,6,5,12
36,20BCB0132,Smart. clever.,9,8,0,9,0,1,5,5,12
38,20BIT0332,Awesome.,9,0,6,7,2,3,2,5,12
40,19BCE2132,Funny. Clever.,9,0,4,5,4,6,2,5,12
44,20BCE0505,Ambivert. Empathetic. Beautiful.,7,2,0,3,0,9,6,5,12
49,19BCE2262,Quiet. Brave. Neat.,9,9,1,8,1,1,9,5,12


## Finding top 10 similar profiles to current profile

In [22]:
# Selecting the predicted cluster again from cluster_df (rather than reusing des_cluster)
# so this cell gives the same result every time it is run
cluster_members = cluster_df[cluster_df['Cluster #'] == designated_cluster[0]]

# Appending the new profile data (pd.concat replaces the deprecated DataFrame.append)
cluster_with_new = pd.concat([cluster_members, new_profile], sort=False)

# Fitting a separate vectorizer to the Bios of this cluster, so the vectorizer
# the classifier's features were built with keeps its original vocabulary
cluster_vectorizer = CountVectorizer()
cluster_x = cluster_vectorizer.fit_transform(cluster_with_new['Bios'])

# Newer scikit-learn exposes get_feature_names_out(); older releases only have get_feature_names()
if hasattr(cluster_vectorizer, "get_feature_names_out"):
    cluster_words = cluster_vectorizer.get_feature_names_out()
else:
    cluster_words = cluster_vectorizer.get_feature_names()

# Creating a new DF that contains the vectorized words
cluster_v = pd.DataFrame(cluster_x.toarray(), index=cluster_with_new.index, columns=cluster_words)

# Joining the Vectorized DF to the previous DF and dropping columns
des_cluster = cluster_with_new.join(cluster_v).drop(['Bios', 'Cluster #', 'ID'], axis=1)


des_cluster

  des_cluster = des_cluster.append(new_profile, sort=False)


Unnamed: 0,Movie,TV,Music,Book,Sport,Vacation,People,Branch,ambivert,artist,...,loving,neat,openminded,personable,quiet,simple,smart,sports,stubborn,vivacious
12,9,4,0,3,0,1,8,5,0,0,...,0,0,0,1,0,0,0,0,1,1
14,8,2,2,8,0,4,6,5,0,0,...,0,0,0,0,0,0,0,0,0,0
15,8,4,0,8,2,9,6,1,0,0,...,0,0,0,0,0,0,0,0,0,0
17,5,0,4,9,4,7,2,7,1,0,...,0,0,0,0,0,0,0,0,0,0
29,8,2,6,8,0,0,6,5,0,1,...,0,0,0,0,0,0,0,1,0,0
36,9,8,0,9,0,1,5,5,0,0,...,0,0,0,0,0,0,1,0,0,0
38,9,0,6,7,2,3,2,5,0,0,...,0,0,0,0,0,0,0,0,0,0
40,9,0,4,5,4,6,2,5,0,0,...,0,0,0,0,0,0,0,0,0,0
44,7,2,0,3,0,9,6,5,1,0,...,0,0,0,0,0,0,0,0,0,0
49,9,9,1,8,1,1,9,5,0,0,...,0,1,0,0,1,0,0,0,0,0


In [18]:
# Finding the Top 10 similar or correlated users to the new user
user_n = new_profile.index[0]

# Transposing the DF so that users become columns, then correlating every user with the new user
corr = des_cluster.T.corrwith(des_cluster.loc[user_n])

# Creating a Series with the Top 10 most similar profiles.
# The new user is dropped by label instead of assuming their self-correlation sorts first:
# a tie at 1.0 could otherwise keep them in the list and push a real match out
top_10_sim = corr.drop(user_n).sort_values(ascending=False).head(10)
top_10_sim

38    0.909334
17    0.842701
29    0.838223
40    0.802417
14    0.726803
56    0.721259
87    0.629759
83    0.594147
36    0.547846
85    0.542628
dtype: float64

In [30]:
# Looking up the original (unscaled) profiles of the most similar users by index label
raw_df.loc[top_10_sim.index]

Unnamed: 0,ID,Bios,Movie,TV,Music,Book,Sport,Vacation,People,Branch
38,20BIT0332,Awesome.,9,0,6,7,2,3,2,5
17,21BBA0217,Ambivert.,5,0,4,9,4,7,2,7
29,19BCE0350,Lark. Sports enthusiast. Artist.,8,2,6,8,0,0,6,5
40,19BCE2132,Funny. Clever.,9,0,4,5,4,6,2,5
14,19BCE0176,laidback.,8,2,2,8,0,4,6,5
56,19BCE0461,Ambivert.,8,5,4,8,2,7,8,5
87,20BCE0463,Charming. Charismatic.,9,9,3,6,4,0,3,5
83,20BEC0697,Openminded. Simple.,9,5,1,7,0,8,7,6
36,20BCB0132,Smart. clever.,9,8,0,9,0,1,5,5
85,20BCE0494,Calm.,9,8,0,6,4,1,2,5


In [29]:
# Saving the fitted SVM classifier to "clf_model.joblib"
# NOTE(review): this cell writes only the classifier; predicting on new profiles also needs
# the fitted vectorizer and scaler - confirm they are persisted somewhere
from joblib import dump

dump(svm, "clf_model.joblib")

['clf_model.joblib']