In [4]:
## Importing Libraries
import requests
import pandas as pd
import time
import random
import re
import numpy as np
import _pickle as pickle
from tqdm import tqdm_notebook as tqdm
from bs4 import BeautifulSoup as bs
pd.set_option('display.max_colwidth', 500)
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.stem import WordNetLemmatizer
from nltk import FreqDist
from nltk.corpus import stopwords
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.preprocessing import MinMaxScaler

In [5]:
# Scraping the generated bios from the website using BeautifulSoup
BIO_URL = "https://www.fakepersongenerator.com/user-biography-generator"

# A timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() fails loudly instead of parsing an HTTP error page.
page = requests.get(BIO_URL, timeout=30)
page.raise_for_status()

# Name the parser explicitly so the result does not depend on which optional
# parser libraries are installed (and to avoid bs4's "no parser" warning).
soup = bs(page.content, "html.parser")

bio_container = soup.find('div', class_='row no-margin for-sign')
if bio_container is None:
    raise RuntimeError("Could not find the bio container on the page; the site layout may have changed.")
bios = bio_container.find_all('p')

# Each <p> wraps its bio in double quotes. re.findall returns a list per
# paragraph, so extend() flattens them: every quoted string becomes one row
# and paragraphs without a quoted bio are skipped.
biolist = []
for paragraph in bios:
    biolist.extend(re.findall('"([^"]*)"', paragraph.text))

# Creating a df from the bio list
bio_df = pd.DataFrame({'Bios': biolist})
bio_df.head()

Unnamed: 0,Bios
0,Problem solver. Award-winning bacon junkie. Professional travel expert. Web evangelist. Avid internet fan.
1,Internet maven. Proud coffee advocate. Beer guru. Avid web expert. Food aficionado.
2,Music scholar. Communicator. Entrepreneur. Analyst. Proud creator. Freelance zombie fanatic. Friendly travel maven.
3,Analyst. Food fan. Incurable communicator. Coffee aficionado. Music expert. Beer specialist.
4,Proud bacon buff. Zombie practitioner. Analyst. Freelance musicaholic. Thinker. Coffee maven.


In [6]:
# (rows, columns) of the scraped bios frame
bio_df.shape

(15, 1)

In [7]:
# Adding attributes
qs = ['Movies','TV','Religion','Music','Sports','Books','Politics']

# Seed for the synthetic answers so that Restart & Run All reproduces the same
# profiles. Set to None to draw fresh random values on every run.
PROFILE_SEED = 42
rng = np.random.default_rng(PROFILE_SEED)

# Creating a df of the categories: one column per category, one row per bio,
# each value a random integer label in the range 0-9.
topic_df = pd.DataFrame(
    {topic: rng.integers(0, 10, size=bio_df.shape[0]) for topic in qs},
    index=bio_df.index,
)

# Joining df (aligned on the shared row index)
final_df = bio_df.join(topic_df)

# Persist the generated profiles for later reuse
with open("profiles.pkl", "wb") as fp:
    pickle.dump(final_df, fp)

In [8]:
# Keep only the first four profiles. .copy() makes this an independent frame,
# so the column assignment below does not raise SettingWithCopyWarning.
final_df = final_df.iloc[:4].copy()

# Synthetic gender label: the first two rows get 0, all remaining rows get 1
final_df['gender'] = [0 if x < 2 else 1 for x in range(len(final_df))]
final_df.head()

Unnamed: 0,Bios,Movies,TV,Religion,Music,Sports,Books,Politics,gender
0,Problem solver. Award-winning bacon junkie. Professional travel expert. Web evangelist. Avid internet fan.,6,7,6,5,3,3,2,0
1,Internet maven. Proud coffee advocate. Beer guru. Avid web expert. Food aficionado.,0,2,8,9,6,8,2,0
2,Music scholar. Communicator. Entrepreneur. Analyst. Proud creator. Freelance zombie fanatic. Friendly travel maven.,6,4,1,6,6,2,1,1
3,Analyst. Food fan. Incurable communicator. Coffee aficionado. Music expert. Beer specialist.,4,6,4,3,1,6,7,1


In [9]:
# Gender encoding: 0 represents male and 1 represents female
final_df['gender'].value_counts()

1    2
0    2
Name: gender, dtype: int64

In [10]:
lemmatizer = WordNetLemmatizer()

# Built once instead of on every call; a frozenset gives O(1) membership tests
# where the original scanned a list for every token.
STOP_WORDS = frozenset(stopwords.words('english'))

# Tokenizing Function
def tokenize(text):
    """Split a bio into lowercase, lemmatized tokens with stop words removed.

    Args:
        text: The bio as a single string.

    Returns:
        A list of lemmatized tokens, in their original order.
    """
    # Lowercasing the words and removing the punctuation (periods)
    text = text.lower().replace('.', '')

    # split() with no argument collapses runs of whitespace, so repeated or
    # trailing spaces do not produce empty-string tokens.
    tokens = text.split()

    # Lemmatizing the words and removing stop words
    return [lemmatizer.lemmatize(tok) for tok in tokens if tok not in STOP_WORDS]

# Not applied: CountVectorizer below does its own tokenization of the raw bios.
# final_df['Bios'] = final_df.Bios.apply(tokenize)

In [11]:
scaler = MinMaxScaler()

# Scaling the categories then replacing the old values.
# The column labels are taken from the frame that is actually scaled, rather
# than positionally from final_df.columns[1:], so they stay correct even if
# 'Bios' is not the first column.
numeric_part = final_df.drop('Bios', axis=1)
scaled_part = pd.DataFrame(
    scaler.fit_transform(numeric_part),
    columns=numeric_part.columns,
    index=final_df.index,
)
df = final_df[['Bios']].join(scaled_part)

In [12]:
# Preview the bios alongside the scaled attribute columns
df.head()

Unnamed: 0,Bios,Movies,TV,Religion,Music,Sports,Books,Politics,gender
0,Problem solver. Award-winning bacon junkie. Professional travel expert. Web evangelist. Avid internet fan.,1.0,1.0,0.714286,0.333333,0.4,0.166667,0.166667,0.0
1,Internet maven. Proud coffee advocate. Beer guru. Avid web expert. Food aficionado.,0.0,0.0,1.0,1.0,1.0,1.0,0.166667,0.0
2,Music scholar. Communicator. Entrepreneur. Analyst. Proud creator. Freelance zombie fanatic. Friendly travel maven.,1.0,0.4,0.0,0.5,1.0,0.0,0.0,1.0
3,Analyst. Food fan. Incurable communicator. Coffee aficionado. Music expert. Beer specialist.,0.666667,0.8,0.428571,0.0,0.0,0.666667,1.0,1.0


In [13]:
# Instantiating the Vectorizer
vectorizer = CountVectorizer()
#vectorizer = TfidfVectorizer()

# Fitting the vectorizer to the Bios
x = vectorizer.fit_transform(df['Bios'])

# get_feature_names() was removed in scikit-learn 1.2. Use its replacement when
# available and fall back to the old name on older installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocabulary = vectorizer.get_feature_names_out()
else:
    vocabulary = vectorizer.get_feature_names()

# Creating a new DF that contains the vectorized words.
# index=df.index keeps the rows aligned with df when concatenating; without it
# a non-default index on df would misalign rows and introduce NaNs.
df_wrds = pd.DataFrame(x.toarray(), columns=vocabulary, index=df.index)

# Concating the words DF with the original DF, then dropping the raw Bios
# column, which is now represented by the word-count columns.
new_df = pd.concat([df, df_wrds], axis=1).drop(columns='Bios')

In [14]:
# Preview the combined frame: scaled attributes plus one count column per word
new_df.head()

Unnamed: 0,Movies,TV,Religion,Music,Sports,Books,Politics,gender,advocate,aficionado,analyst,avid,award,bacon,beer,coffee,communicator,creator,entrepreneur,evangelist,expert,fan,fanatic,food,freelance,friendly,guru,incurable,internet,junkie,maven,music,problem,professional,proud,scholar,solver,specialist,travel,web,winning,zombie
0,1.0,1.0,0.714286,0.333333,0.4,0.166667,0.166667,0.0,0,0,0,1,1,1,0,0,0,0,0,1,1,1,0,0,0,0,0,0,1,1,0,0,1,1,0,0,1,0,1,1,1,0
1,0.0,0.0,1.0,1.0,1.0,1.0,0.166667,0.0,1,1,0,1,0,0,1,1,0,0,0,0,1,0,0,1,0,0,1,0,1,0,1,0,0,0,1,0,0,0,0,1,0,0
2,1.0,0.4,0.0,0.5,1.0,0.0,0.0,1.0,0,0,1,0,0,0,0,0,1,1,1,0,0,0,1,0,1,1,0,0,0,0,1,1,0,0,1,1,0,0,1,0,0,1
3,0.666667,0.8,0.428571,0.0,0.0,0.666667,1.0,1.0,0,1,1,0,0,0,1,1,1,0,0,0,1,1,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0


In [15]:
# (rows, columns) of the combined feature frame
new_df.shape

(4, 42)

In [16]:
# Finding Correlations among the users.
# Transposing turns each user into a column, so corr() yields a user-by-user
# correlation matrix over all features.
corr_group = new_df.T.corr()

# Finding the Similar Users
# Randomly selecting a user
random_user = random.choice(corr_group.index)

print("Top similar users to User #", random_user, '\n')

# Creating a df with the most similar users to the selected user
top_sim = corr_group[[random_user]].sort_values(by=[random_user],axis=0, ascending=False)

# Printing out the results
print(top_sim)

# Exclude the selected user explicitly instead of assuming they sort first:
# another user tied at 1.0 could otherwise push them to second place. Users
# with an undefined (NaN) correlation are not candidates either.
candidates = top_sim.drop(index=random_user).dropna()

if candidates.empty:
    print("\nNo other users to compare with User #", random_user)
else:
    print("\nThe most similar user to User #", random_user, "is User #", candidates.index[0])

Top similar users to User # 1 

          1
1  1.000000
3  0.029021
0 -0.125833
2 -0.309592

The most similar user to User # 1 is User # 3


In [18]:
# We can also play with TfidfVectorizer.

Q. How can an ML model be integrated with the backend of a Flutter app?

Answer. The model can be integrated with the backend of a Flutter app in two ways. One option is to create and host a web API service (for example, using Flask) and call that service from the Flutter application to fetch the result. Alternatively, the firebase_ml_custom plugin can be used to manage hosting the model and downloading it to the user's device.