In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.preprocessing import FunctionTransformer

## Read Data From File

In [7]:
# Load the training set, using the first CSV column as the index, and
# replace every missing value with 0.
data = pd.read_csv('../data/training_data.csv', index_col=0).fillna(0)

In [8]:
# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,Username,Id,Contributions,JavaScript,Python,Java,C#,PHP,TypeScript,Ruby,...,Dart,Vue,Assembly,Sass,CSS,HTML,Pascal,Racket,Zig,Other
mojombo,1,79,0,1970379,0,0,0,0,2566242,928419,...,0,0,0,0,7758,17442,0,0,0,240167
defunkt,2,1,0,4609687,0,0,0,0,0,470103,...,0,0,0,0,6399,23938,0,0,0,896415
wycats,4,358,0,3378185,0,0,0,0,2687760,952,...,0,0,0,0,21641,441513,0,0,0,3060
brynary,19,3481,0,11661,0,0,0,0,0,951748,...,0,0,0,0,0,17954,0,0,0,3142
kevinclark,20,18,0,0,0,0,0,0,0,43311,...,0,0,0,0,0,0,0,0,0,8688


In [None]:
# Summary statistics for the columns of `data`.
data.describe()

In [None]:
# Per-column means.
# NOTE(review): on pandas >= 2.0 this raises a TypeError if `data` holds any
# non-numeric column — confirm the username strings live only in the index.
data.mean()

In [None]:
# Column dtypes, non-null counts and memory usage.
data.info()

## Transform Data

#### 1. Convert the byte counts into fractions of each row's total

In [None]:
# Columns that get standardized below: two user-level fields followed by the
# byte-count columns (everything from index 2 onward).
col = [
    "Id", "Contributions",
    "JavaScript", "Python", "Java", "C#", "PHP", "TypeScript", "Ruby",
    "C++", "C", "Swift", "Go", "Shell", "Kotlin", "Rust", "PowerShell",
    "Objective-C", "R", "MATLAB", "Dart", "Vue", "Assembly", "Sass", "CSS",
    "HTML", "Pascal", "Racket", "Zig", "Other",
]
# Names under which the standardized ("-T") version of each column is stored.
tcols = [f"{name}-T" for name in col]
def turn_to_percent(X, columns):
    """Rescale the given columns so each row's values are fractions of the row total.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing ``columns``. It is not modified; a copy is returned.
    columns : list of str
        Columns to normalize row-wise.

    Returns
    -------
    pandas.DataFrame
        Copy of ``X`` in which every value in ``columns`` has been divided by
        the sum of ``columns`` for its row. Rows whose total is zero keep
        their values instead of turning into NaN/inf.
    """
    result = X.copy()
    row_totals = result[columns].sum(axis=1)
    # A zero total would give 0/0 = NaN, which KMeans rejects further down;
    # divide those rows by 1 so they are left as they are.
    safe_totals = row_totals.mask(row_totals == 0, 1)
    result[columns] = result[columns].div(safe_totals, axis=0)
    return result

# Build a FunctionTransformer around turn_to_percent, forwarding every entry of
# `col` after the first two ("Id", "Contributions") as its `columns` argument.
transformer = FunctionTransformer(
    func=turn_to_percent,
    kw_args=dict(columns=col[2:]),
    validate=False,
)

# Run the transformation and keep the result as the working frame.
data = transformer.transform(data)
data

#### 2. Standardize the data using a StandardScaler

In [None]:
# Fit the scaler on the `col` columns, then store the standardized values
# next to the originals under the matching "-T" column names.
scaler = StandardScaler().fit(data[col])
data[tcols] = scaler.transform(data[col])
data.head()

#### 3. Determine Number of Clusters

In [None]:
def optimize_kmeans(data, max_k):
    """Plot KMeans inertia against the number of clusters (elbow method).

    A KMeans model (random_state=0) is fitted to ``data`` for every k in
    ``range(1, max_k)`` — ``max_k`` itself is excluded — and the resulting
    inertias are drawn as a line chart. Nothing is returned.
    """
    candidate_ks = range(1, max_k)
    inertia = [
        KMeans(n_clusters=k, random_state=0).fit(data).inertia_
        for k in candidate_ks
    ]

    # Elbow plot: inertia as a function of k.
    plt.figure(figsize=(10, 5))
    plt.plot(candidate_ks, inertia)
    plt.xlabel('k')
    plt.ylabel('inertia')
    plt.title('Elbow Method')
    plt.show()

In [None]:
# Plot the elbow curve for the standardized columns; with max_k=40 the
# candidates are k = 1..39 because the upper bound is exclusive.
optimize_kmeans(data[tcols], 40)

#### 4. Apply K-Means Clustering

In [None]:
# Fit the final model with 25 clusters on the standardized columns and
# record each row's cluster label.
kmeans = KMeans(n_clusters=25, random_state=0)
kmeans.fit(data[tcols])
data['cluster'] = kmeans.labels_
data.head()

## Write K-Means Model to File

In [None]:
# Serialize the fitted KMeans model to disk.
with open('../data/kmeansmodel.pkl', mode='wb') as model_file:
    pickle.dump(kmeans, model_file)

## Write Means and Standard Deviations to File

In [None]:
# Persist the per-column mean and standard deviation of the `col` columns:
# a header row with the column names, then one row of means and one of stds.
# Only `col` is selected so unrelated or non-numeric columns cannot affect (or
# break) the statistics. ddof=0 (population std) matches what StandardScaler
# used to produce the "-T" columns the model was fit on; pandas' default is
# the sample std (ddof=1), which would not reproduce that scaling.
mean = data[col].mean()
std = data[col].std(ddof=0)
with open("../data/meanAndStd.csv", "w") as f:
    f.write(",".join(col) + "\n")
    f.write(",".join(str(mean[x]) for x in col) + "\n")
    f.write(",".join(str(std[x]) for x in col) + "\n")


In [None]:
# List the final column names (originals, "-T" columns and `cluster`).
data.columns