In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from kmodes.kmodes import KModes
from sklearn.preprocessing import LabelEncoder
from kneed import KneeLocator

from personas.common.database import Database

In [41]:
# Identifier handed to Database.get_brand to select the brand under analysis.
uhopper_id = "223439979"

# Open the project database and fetch the brand record.
# NOTE(review): the meaning of db=1 is not visible in this notebook -- confirm
# which database index/environment it selects.
db = Database(db=1)
brand = db.get_brand(uhopper_id)

# The brand's followers are the users that get clustered below.
users = brand.followers

In [42]:
# Create users dataframe
# Each follower object contributes one record made of its instance attributes,
# so the frame gets one row per follower and one column per attribute.
users_dict = [vars(user) for user in users]
df_users = pd.DataFrame(users_dict)

# Preview the first rows (rendered as the cell output).
df_users.head()

Unnamed: 0,id_,name,gender,location,profile_image_url,biography,type_,pref_language,site,followers_count,following_count,posts
0,226874885,Andrea Orlando,female,SaaS life 👉,https://pbs.twimg.com/profile_images/130020511...,Managing Partner @startupwiseguys 🇮🇹 | @insead...,anthroponym,en,https://www.startupwiseguys.com/saas,1017,1737,[<personas.common.post.Post object at 0x7f692e...
1,1347990208596336642,POP OUT Rete Nazionale Pubblicità Esterna,,Italia,https://pbs.twimg.com/profile_images/134799047...,Account della Rete Nazionale di Pubblicità Est...,brand-name,it,https://m.facebook.com/profile.php?id=10582547...,75,132,[<personas.common.post.Post object at 0x7f692c...
2,471106799,Karol Hanczarek,female,,https://pbs.twimg.com/profile_images/955957976...,,anthroponym,en,,36,234,[<personas.common.post.Post object at 0x7f6928...
3,1224055543184199683,Martina,female,"Cavedine, Trentino-Alto Adige",https://pbs.twimg.com/profile_images/135701641...,📚 Industrial Engineering Trento💎 design & tech...,anthroponym,it,,4,76,[<personas.common.post.Post object at 0x7f6927...
4,1366794177669984258,Nicola Farina,female,"Verona, Italy",https://pbs.twimg.com/profile_images/137156783...,Absolute mountain enthusiast! Studying Compute...,anthroponym,it,,0,8,[<personas.common.post.Post object at 0x7f6927...


In [43]:
# Columns that are removed before clustering (they are not used as features).
cols_to_drop = [
    "id_",
    "name",
    "location",
    "profile_image_url",
    "biography",
    "site",
    "followers_count",
    "following_count",
    "posts",
    "type_",
]

# Every remaining column of the users frame is a clustering feature; the
# frame's own column order is preserved.
cols_to_cluster = [name for name in df_users.columns if name not in cols_to_drop]

In [44]:
# Keep only non-brand accounts that have a value for every clustering feature.
#
# Written as one method chain that rebinds `df_users` instead of calling
# `inplace=True` methods on the boolean-filtered slice: in-place mutation of a
# filtered frame is the pattern pandas flags as chained assignment, and the
# chain yields the same resulting frame.
df_users = (
    # Drop rows with brand type
    df_users[df_users["type_"] != "brand-name"]
    # Presumably normalises None placeholders to NaN so all missing values
    # look the same downstream -- kept from the original cell.
    .fillna(value=np.nan)
    # Drop rows that are missing a value in any of the clustering columns
    .dropna(subset=cols_to_cluster)
)

In [45]:
# PREPARE DATAFRAME FOR CLUSTERING
# Work on an independent copy so the users frame keeps its original columns
# (DataFrame.copy is deep by default).
df_cluster = df_users.copy()

# Turn the list of posts of each user into the number of posts.
df_cluster["posts"] = df_cluster["posts"].map(len)

# Binary activity / popularity flags: 1 when the user has at least 100 posts
# (active) or at least 100 followers (popular), otherwise 0.
df_cluster["active"] = df_cluster["posts"].ge(100).astype("int64")
df_cluster["popular"] = df_cluster["followers_count"].map(int).ge(100).astype("int64")

# Remove every column that is not a clustering feature.
df_cluster = df_cluster.drop(columns=cols_to_drop)

# Label-encode the categorical columns. The fitted encoders stay available as
# notebook globals so the integer codes can be mapped back to labels later.
gender_encoder = LabelEncoder().fit(df_cluster["gender"])
df_cluster["gender"] = gender_encoder.transform(df_cluster["gender"])

lang_encoder = LabelEncoder().fit(df_cluster["pref_language"])
df_cluster["pref_language"] = lang_encoder.transform(df_cluster["pref_language"])

# Find optimal number of clusters (KModes)
# Elbow method: fit KModes once per candidate k, record the clustering cost of
# each fit, and let KneeLocator pick the knee of the decreasing cost curve.
costs = []
kmax = 15

# KModes cannot fit more clusters than there are rows, so cap the sweep when
# the follower set is smaller than kmax.
k_values = range(1, min(kmax, len(df_cluster)) + 1)

for k in k_values:
    kmodes = KModes(n_clusters=k, init="Cao")
    kmodes.fit(df_cluster)
    costs.append(kmodes.cost_)

kn = KneeLocator(k_values, costs, curve="convex", direction="decreasing")
number_clusters = kn.knee

# KneeLocator leaves `knee` as None when the curve has no detectable knee.
# Fail here with a clear message instead of letting a later
# KModes(n_clusters=None) call fail obscurely.
if number_clusters is None:
    raise ValueError(
        "KneeLocator found no knee in the KModes cost curve; "
        "choose the number of clusters manually."
    )

print(number_clusters)

5


In [47]:
# Perform clustering
# Final KModes run using the number of clusters selected by the elbow search.
kmodes = KModes(
    init="Cao",
    n_clusters=number_clusters,
)

# One cluster label per row of df_cluster, in row order.
clusters = kmodes.fit_predict(df_cluster)

# Cluster centroids of the fitted model (still label-encoded at this point).
centroids = kmodes.cluster_centroids_

In [38]:
# Put the fitted centroids in a frame that has the same columns as the
# clustering features.
centroids_df = pd.DataFrame(kmodes.cluster_centroids_, columns=df_cluster.columns)

# Map the encoded gender / language codes back to their original labels and
# expose the row position of each centroid as its cluster id.
centroids_df = centroids_df.assign(
    gender=gender_encoder.inverse_transform(centroids_df["gender"]),
    pref_language=lang_encoder.inverse_transform(centroids_df["pref_language"]),
    cluster=centroids_df.index,
)

# One plain dict per cluster.
centroids_dict = centroids_df.to_dict(orient="records")
print(centroids_dict)

[{'gender': 'male', 'pref_language': 'en', 'active': 1, 'popular': 1, 'cluster': 0}, {'gender': 'female', 'pref_language': 'it', 'active': 1, 'popular': 1, 'cluster': 1}, {'gender': 'female', 'pref_language': 'en', 'active': 0, 'popular': 0, 'cluster': 2}, {'gender': 'female', 'pref_language': 'fr', 'active': 1, 'popular': 1, 'cluster': 3}, {'gender': 'male', 'pref_language': 'en', 'active': 0, 'popular': 0, 'cluster': 4}]


In [13]:
# Add cluster attribute to original dataframe
# `clusters` holds one label per row of df_cluster, which has the same rows as
# df_users. Rebinding via assign avoids setting a column on a filtered frame.
df_users = df_users.assign(cluster=clusters)

# Look-up table from follower id to cluster label. This replaces a lock-step
# scan over df_users and brand.followers that only worked while both were in
# the same order and raised IndexError on any mismatch. Labels are stored as
# plain Python ints.
cluster_by_id = {
    user_id: int(label)
    for user_id, label in zip(df_users["id_"], df_users["cluster"])
}

# Copy the label onto every follower that took part in the clustering;
# followers filtered out earlier (brands, missing features) are left untouched.
for follower in brand.followers:
    label = cluster_by_id.get(follower.id_)
    if label is not None:
        follower.cluster = label

# Sanity check: number of followers that now carry a cluster label.
z = sum(
    1
    for follower in brand.followers
    if getattr(follower, "cluster", None) is not None
)
print(z)

257
