# Modeling

In [68]:
# Ignoring warning messages from python
import warnings
warnings.filterwarnings('ignore')

# General use imports
import pandas as pd
import numpy as np

# Specific Modules
import prepare as prep
import json
import os
import unicodedata
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer


# # Visualization imports
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly
import plotly.express as px

## 1. Getting the data

In [2]:
df = prep.prep_data()

In [3]:
df.shape

(353, 3)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 353 entries, 1 to 475
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   repo      353 non-null    object
 1   language  353 non-null    object
 2   content   353 non-null    object
dtypes: object(3)
memory usage: 11.0+ KB


In [5]:
df.head()

Unnamed: 0,repo,language,content
1,CharlesPikachu/Games,Python,div aligncenter img srcdocslogopng width600 di...
2,channingbreeze/games,JavaScript,### phaser phaserphaserphaserhttpwwwphaserchin...
3,arcxingye/EatKano,JavaScript,p aligncenter hrefhttpsxingyemegameeatkanoimg ...
4,coding-horror/basic-computer-games,C#,### updating first million selling computer bo...
5,rwv/chinese-dos-games,Python,# do do 1898 python 3 python python downloadda...


## 2. Splitting and vectorizing

In [6]:
# Calling the split function and displaying the shape of the datasets

train, validate, test = prep.split_data(df)

In [7]:
train.shape

(197, 3)

In [8]:
validate.shape

(85, 3)

In [9]:
test.shape

(71, 3)

In [10]:
# Vectorizing and 'learning' on train dataset

tfidf = TfidfVectorizer()
X_train = tfidf.fit_transform(train.content)
y_train = train.language

In [11]:
X_train

<197x13439 sparse matrix of type '<class 'numpy.float64'>'
	with 33640 stored elements in Compressed Sparse Row format>

In [12]:
y_train

115       C++
395      HTML
166        C#
26        C++
162       C++
        ...  
217    Python
156       C++
116    Python
158      Java
153      Java
Name: language, Length: 197, dtype: object

In [13]:
# Applying the vectorization without giving out the content of validate
X_validate = tfidf.transform(validate.content)
y_validate = validate.language

In [14]:
# Applying the vectorization without giving out the content of validate
X_test = tfidf.transform(test.content)
y_test = test.language

In [17]:
X_validate.shape, X_train.shape

((85, 13439), (197, 13439))

## Logistic Regression Models

>### Building a baseline and fitting the train dataset and modeling

In [18]:
# Creating a dataframe of the target variable
model_train = pd.DataFrame(dict(actual=y_train))

In [19]:
model_train.head()

Unnamed: 0,actual
115,C++
395,HTML
166,C#
26,C++
162,C++


In [20]:
# Checking the unique content of the actuals
model_train.actual.nunique(), model_train.actual.unique()

(8,
 array(['C++', 'HTML', 'C#', 'C', 'JavaScript', 'Java', 'Python',
        'TypeScript'], dtype=object))

In [21]:
# checking the value count to decide the baseline target
model_train.actual.value_counts()

JavaScript    50
C++           36
Python        33
C#            24
C             18
Java          16
HTML          10
TypeScript    10
Name: actual, dtype: int64

In [22]:
# Since JavaScript has the highest occurence I'll use it 
model_train['baseline'] = model_train['actual'] == 'JavaScript'
model_train.head()

Unnamed: 0,actual,baseline
115,C++,False
395,HTML,False
166,C#,False
26,C++,False
162,C++,False


In [94]:
# Calculating the baseline model
print(f'The baseline model has an average capture rate for Javascript of:{model_train.baseline.mean():.2%}')

The baseline model has an average capture rate for Javascript of:25.38%


In [24]:
# Applying the LogisticRegression (default settings) and predicting the occurence of the languages

lm = LogisticRegression().fit(X_train, y_train)

model_train['predicted'] = lm.predict(X_train)

In [25]:
# Displaying actual, baseline, and pedicted
model_train.head()

Unnamed: 0,actual,baseline,predicted
115,C++,False,C++
395,HTML,False,HTML
166,C#,False,C#
26,C++,False,C++
162,C++,False,C++


In [26]:
# Displaying the accuracy and c;assification report of the model

print('Accuracy: {:.2%}'.format(accuracy_score(model_train.actual, model_train.predicted)))
print('------------------')
print('Confusion Matrix')
print(pd.crosstab(model_train.predicted, model_train.actual))
print('------------------')
print('Classification Report')
print('\n')
print(classification_report(model_train.actual, model_train.predicted))

Accuracy: 85.79%
------------------
Confusion Matrix
actual       C  C#  C++  HTML  Java  JavaScript  Python  TypeScript
predicted                                                          
C           11   0    0     0     0           0       0           0
C#           0  24    0     0     0           0       0           0
C++          2   0   35     0     2           0       1           0
HTML         0   0    0     6     0           0       0           0
Java         0   0    0     0     9           0       0           0
JavaScript   5   0    1     4     5          50       0           8
Python       0   0    0     0     0           0      32           0
TypeScript   0   0    0     0     0           0       0           2
------------------
Classification Report


              precision    recall  f1-score   support

           C       1.00      0.61      0.76        18
          C#       1.00      1.00      1.00        24
         C++       0.88      0.97      0.92        36
       

>### Running model on validate dadtaset

In [31]:
model_train.head()

Unnamed: 0,actual,baseline,predicted
115,C++,False,C++
395,HTML,False,HTML
166,C#,False,C#
26,C++,False,C++
162,C++,False,C++


In [37]:
model_validate = pd.DataFrame(dict(actual=y_validate))

In [38]:
model_validate.head()

Unnamed: 0,actual
183,C++
140,JavaScript
334,C
60,Python
196,C++


In [39]:
# Predicting on validate
model_validate['validate_pred'] = lm.predict(X_validate)

In [40]:
model_validate.head()

Unnamed: 0,actual,validate_pred
183,C++,JavaScript
140,JavaScript,JavaScript
334,C,C++
60,Python,C++
196,C++,C++


In [41]:
print('Accuracy: {:.2%}'.format(accuracy_score(model_validate.actual, model_validate.validate_pred)))
print('------------------')
print('Confusion Matrix')
print(pd.crosstab(model_validate.validate_pred, model_validate.actual))
print('------------------')
print('Classification Report')
print('\n')
print(classification_report(model_validate.actual, model_validate.validate_pred))

Accuracy: 45.88%
------------------
Confusion Matrix
actual         C  C#  C++  HTML  Java  JavaScript  Python  TypeScript
validate_pred                                                        
C              1   0    0     0     0           0       0           0
C#             0   2    0     0     0           0       0           0
C++            1   0    6     0     3           0       1           0
HTML           0   0    0     2     0           0       0           0
JavaScript     5   9    9     2     4          22       7           4
Python         0   0    1     0     0           0       6           0
------------------
Classification Report


              precision    recall  f1-score   support

           C       1.00      0.14      0.25         7
          C#       1.00      0.18      0.31        11
         C++       0.55      0.38      0.44        16
        HTML       1.00      0.50      0.67         4
        Java       0.00      0.00      0.00         7
  JavaScript       

>### Summary  
        - The train dataset had an accuracy at 85.79%
        - The validate dataset went down to 45.88%
        - I will run a Random Forest Model to have a different perspective

## Random Forest Model

In [42]:
X_train

<197x13439 sparse matrix of type '<class 'numpy.float64'>'
	with 33640 stored elements in Compressed Sparse Row format>

In [95]:
model_train_rf = pd.DataFrame(dict(actual=y_train))

In [97]:
model_train_rf.head(2)

Unnamed: 0,actual
115,C++
395,HTML


In [101]:
# Writing a loop to capture a certain range of depth for the model to consider
for i in range(3, 13):
    multi_depth = RandomForestClassifier(max_depth=i, random_state=175)

    all_rf = multi_depth.fit(X_train, y_train)

    y_pred = all_rf.predict(X_train)

    report = classification_report(y_train, y_pred, output_dict=True)
    print(f'RandomForest depth {i}\n')
    print(pd.DataFrame(report))
    print('\n=======================\n')

RandomForest depth 3

              C     C#        C++   HTML       Java  JavaScript     Python  \
precision   0.0   1.00   1.000000   1.00   1.000000    0.316456   1.000000   
recall      0.0   0.25   0.361111   0.60   0.062500    1.000000   0.393939   
f1-score    0.0   0.40   0.530612   0.75   0.117647    0.480769   0.565217   
support    18.0  24.00  36.000000  10.00  16.000000   50.000000  33.000000   

           TypeScript  accuracy   macro avg  weighted avg  
precision         0.0  0.451777    0.664557      0.684380  
recall            0.0  0.451777    0.333444      0.451777  
f1-score          0.0  0.451777    0.355531      0.410026  
support          10.0  0.451777  197.000000    197.000000  


RandomForest depth 4

              C         C#        C++   HTML       Java  JavaScript  \
precision   0.0   1.000000   1.000000   1.00   1.000000    0.340136   
recall      0.0   0.375000   0.500000   0.60   0.062500    1.000000   
f1-score    0.0   0.545455   0.666667   0.75   0.1

>### The bext depth is depth 12 with an accuracy at 82.74%

>### On out-of-sample data, Validate

In [66]:
for i in range(3, 13):
    multi_depth = RandomForestClassifier(max_depth=i, random_state=175)

    all_rf = multi_depth.fit(X_validate, y_validate)

    y_pred_validate = all_rf.predict(X_validate)

    report = classification_report(y_validate, y_pred_validate, output_dict=True)
    print(f'RandomForest depth {i}\n')
    print(pd.DataFrame(report))
    print('\n=======================\n')

RandomForest depth 3

             C         C#        C++  HTML  Java  JavaScript     Python  \
precision  0.0   1.000000   1.000000   0.0   0.0    0.360656   1.000000   
recall     0.0   0.363636   0.687500   0.0   0.0    1.000000   0.500000   
f1-score   0.0   0.533333   0.814815   0.0   0.0    0.530120   0.666667   
support    7.0  11.000000  16.000000   4.0   7.0   22.000000  14.000000   

           TypeScript  accuracy  macro avg  weighted avg  
precision    1.000000  0.541176   0.545082      0.622758  
recall       0.500000  0.541176   0.381392      0.541176  
f1-score     0.666667  0.541176   0.401450      0.500781  
support      4.000000  0.541176  85.000000     85.000000  


RandomForest depth 4

                  C         C#        C++  HTML      Java  JavaScript  \
precision  1.000000   1.000000   1.000000   0.0  1.000000    0.431373   
recall     0.428571   0.727273   0.750000   0.0  0.142857    1.000000   
f1-score   0.600000   0.842105   0.857143   0.0  0.250000    0.6

>### Summary  
        - The best depth for the train dataset is 12 with an accuracy at 

## K Nearest Neighbor Model (KNN) 

In [69]:
# KNN Default values
knn = KNeighborsClassifier()

In [None]:
knn.fit(X_train, y_train)

In [None]:
KNeighborsClassifier()

In [None]:
y_pred = knn.predict(X_train)

In [72]:
for i in range(3, 13):
    knn_depth = KNeighborsClassifier(n_neighbors=i)

    all_knn = knn_depth.fit(X_train, y_train)

    y_pred_knn = all_knn.predict(X_train)

    report = classification_report(y_train, y_pred_knn, output_dict=True)
    print(f'KNN depth {i}\n')
    print(pd.DataFrame(report))
    print('\n=======================\n')

KNN depth 3

                   C     C#        C++   HTML       Java  JavaScript  \
precision   0.173077   1.00   0.952381   1.00   1.000000    1.000000   
recall      1.000000   0.25   0.555556   0.60   0.437500    0.560000   
f1-score    0.295082   0.40   0.701754   0.75   0.608696    0.717949   
support    18.000000  24.00  36.000000  10.00  16.000000   50.000000   

              Python  TypeScript  accuracy   macro avg  weighted avg  
precision   1.000000    1.000000  0.558376    0.890682      0.915742  
recall      0.666667    0.300000  0.558376    0.546215      0.558376  
f1-score    0.800000    0.461538  0.558376    0.591877      0.631099  
support    33.000000   10.000000  0.558376  197.000000    197.000000  


KNN depth 4

               C     C#        C++   HTML       Java  JavaScript     Python  \
precision   0.25   0.75   0.913043   1.00   0.909091    0.820513   0.923077   
recall      1.00   0.50   0.583333   0.60   0.625000    0.640000   0.727273   
f1-score    0.40   

>### On out-of-sample data

In [73]:
for i in range(3, 13):
    knn_depth = KNeighborsClassifier(n_neighbors=i)

    all_knn = knn_depth.fit(X_validate, y_validate)

    y_val_pred_knn = all_knn.predict(X_validate)

    report = classification_report(y_validate, y_val_pred_knn, output_dict=True)
    print(f'KNN depth {i}\n')
    print(pd.DataFrame(report))
    print('\n=======================\n')

KNN depth 3

             C    C#        C++      HTML      Java  JavaScript  Python  \
precision  0.0   0.0   0.202532  1.000000  1.000000    1.000000     0.0   
recall     0.0   0.0   1.000000  0.500000  0.285714    0.090909     0.0   
f1-score   0.0   0.0   0.336842  0.666667  0.444444    0.166667     0.0   
support    7.0  11.0  16.000000  4.000000  7.000000   22.000000    14.0   

           TypeScript  accuracy  macro avg  weighted avg  
precision         0.0  0.258824   0.400316      0.426359  
recall            0.0  0.258824   0.234578      0.258824  
f1-score          0.0  0.258824   0.201827      0.174517  
support           4.0  0.258824  85.000000     85.000000  


KNN depth 4

                  C         C#        C++  HTML  Java  JavaScript  Python  \
precision  1.000000   1.000000   0.202532   0.0   0.0         0.0     0.0   
recall     0.142857   0.454545   1.000000   0.0   0.0         0.0     0.0   
f1-score   0.250000   0.625000   0.336842   0.0   0.0         0.0     