# Modeling

In [44]:
# Ignoring warning messages from python
import warnings
warnings.filterwarnings('ignore')

# General use imports
import pandas as pd
import numpy as np

# Specific Modules
import prepare as prep
import json
import os
import unicodedata
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer


# # Visualization imports
import matplotlib.pyplot as plt
import seaborn as sns
# import plotly.graph_objects as go
# import plotly
# import plotly.express as px

In [2]:
df = prep.prep_data()

In [3]:
df.shape

(353, 3)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 353 entries, 1 to 475
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   repo      353 non-null    object
 1   language  353 non-null    object
 2   content   353 non-null    object
dtypes: object(3)
memory usage: 11.0+ KB


In [5]:
df.head()

Unnamed: 0,repo,language,content
1,CharlesPikachu/Games,Python,div aligncenter img srcdocslogopng width600 di...
2,channingbreeze/games,JavaScript,### phaser phaserphaserphaserhttpwwwphaserchin...
3,arcxingye/EatKano,JavaScript,p aligncenter hrefhttpsxingyemegameeatkanoimg ...
4,coding-horror/basic-computer-games,C#,### updating first million selling computer bo...
5,rwv/chinese-dos-games,Python,# do do 1898 python 3 python python downloadda...


## Splitting and vectorizing

In [6]:
train, validate, test = prep.split_data(df)

In [7]:
train.shape

(197, 3)

In [8]:
validate.shape

(85, 3)

In [9]:
test.shape

(71, 3)

In [10]:
tfidf = TfidfVectorizer()
X_train = tfidf.fit_transform(train.content)
y_train = train.language

In [11]:
X_train

<197x13439 sparse matrix of type '<class 'numpy.float64'>'
	with 33640 stored elements in Compressed Sparse Row format>

In [12]:
y_train

115       C++
395      HTML
166        C#
26        C++
162       C++
        ...  
217    Python
156       C++
116    Python
158      Java
153      Java
Name: language, Length: 197, dtype: object

In [13]:
# tfidf = TfidfVectorizer()
X_validate = tfidf.transform(validate.content)
y_validate = validate.language

In [14]:
# tfidf = TfidfVectorizer()
X_test = tfidf.transform(test.content)
y_test = test.language

In [17]:
X_validate.shape, X_train.shape

((85, 13439), (197, 13439))

## Logistic Regression Models

>### Train

In [18]:
model_train = pd.DataFrame(dict(actual=y_train))

In [19]:
model_train.head()

Unnamed: 0,actual
115,C++
395,HTML
166,C#
26,C++
162,C++


In [20]:
model_train.actual.nunique(), model_train.actual.unique()

(8,
 array(['C++', 'HTML', 'C#', 'C', 'JavaScript', 'Java', 'Python',
        'TypeScript'], dtype=object))

In [21]:
model_train.actual.value_counts()

JavaScript    50
C++           36
Python        33
C#            24
C             18
Java          16
HTML          10
TypeScript    10
Name: actual, dtype: int64

In [22]:
model_train['baseline'] = model_train['actual'] == 'JavaScript'
model_train.head()

Unnamed: 0,actual,baseline
115,C++,False
395,HTML,False
166,C#,False
26,C++,False
162,C++,False


In [23]:
model_train.baseline.mean()

0.25380710659898476

In [24]:
lm = LogisticRegression().fit(X_train, y_train)

model_train['predicted'] = lm.predict(X_train)

In [25]:
model_train.head()

Unnamed: 0,actual,baseline,predicted
115,C++,False,C++
395,HTML,False,HTML
166,C#,False,C#
26,C++,False,C++
162,C++,False,C++


In [None]:
model_train['predicted'] = lm.predict(X_train)

In [26]:
print('Accuracy: {:.2%}'.format(accuracy_score(model_train.actual, model_train.predicted)))
print('------------------')
print('Confusion Matrix')
print(pd.crosstab(model_train.predicted, model_train.actual))
print('------------------')
print('Classification Report')
print('\n')
print(classification_report(model_train.actual, model_train.predicted))

Accuracy: 85.79%
------------------
Confusion Matrix
actual       C  C#  C++  HTML  Java  JavaScript  Python  TypeScript
predicted                                                          
C           11   0    0     0     0           0       0           0
C#           0  24    0     0     0           0       0           0
C++          2   0   35     0     2           0       1           0
HTML         0   0    0     6     0           0       0           0
Java         0   0    0     0     9           0       0           0
JavaScript   5   0    1     4     5          50       0           8
Python       0   0    0     0     0           0      32           0
TypeScript   0   0    0     0     0           0       0           2
------------------
Classification Report


              precision    recall  f1-score   support

           C       1.00      0.61      0.76        18
          C#       1.00      1.00      1.00        24
         C++       0.88      0.97      0.92        36
       

>### Validate

In [31]:
model_train.head()

Unnamed: 0,actual,baseline,predicted
115,C++,False,C++
395,HTML,False,HTML
166,C#,False,C#
26,C++,False,C++
162,C++,False,C++


In [37]:
model_validate = pd.DataFrame(dict(actual=y_validate))

In [38]:
model_validate.head()

Unnamed: 0,actual
183,C++
140,JavaScript
334,C
60,Python
196,C++


In [39]:
model_validate['validate_pred'] = lm.predict(X_validate)

In [40]:
model_validate.head()

Unnamed: 0,actual,validate_pred
183,C++,JavaScript
140,JavaScript,JavaScript
334,C,C++
60,Python,C++
196,C++,C++


In [41]:
print('Accuracy: {:.2%}'.format(accuracy_score(model_validate.actual, model_validate.validate_pred)))
print('------------------')
print('Confusion Matrix')
print(pd.crosstab(model_validate.validate_pred, model_validate.actual))
print('------------------')
print('Classification Report')
print('\n')
print(classification_report(model_validate.actual, model_validate.validate_pred))

Accuracy: 45.88%
------------------
Confusion Matrix
actual         C  C#  C++  HTML  Java  JavaScript  Python  TypeScript
validate_pred                                                        
C              1   0    0     0     0           0       0           0
C#             0   2    0     0     0           0       0           0
C++            1   0    6     0     3           0       1           0
HTML           0   0    0     2     0           0       0           0
JavaScript     5   9    9     2     4          22       7           4
Python         0   0    1     0     0           0       6           0
------------------
Classification Report


              precision    recall  f1-score   support

           C       1.00      0.14      0.25         7
          C#       1.00      0.18      0.31        11
         C++       0.55      0.38      0.44        16
        HTML       1.00      0.50      0.67         4
        Java       0.00      0.00      0.00         7
  JavaScript       

## Random Forest

In [42]:
X_train

<197x13439 sparse matrix of type '<class 'numpy.float64'>'
	with 33640 stored elements in Compressed Sparse Row format>

In [None]:
RandomForestClassifier(bootstrap=True, 
                    class_weight=None, 
                    criterion='gini',
                    min_samples_leaf=1,
                    n_estimators=100,
                    max_depth=3, 
                    random_state=175)

In [56]:
for i in range(3, 13):
    multi_depth = RandomForestClassifier(max_depth=i, random_state=175)

    all_rf = multi_depth.fit(X_train, y_train)

    y_pred = all_rf.predict(X_train)

    report = classification_report(y_train, y_pred, output_dict=True)
#     best_depth = report
    print(f'RandomForest depth {i}\n')
    print(pd.DataFrame(report))
    print('\n=======================\n')
#     print(f'Best Depth {})

RandomForest depth 3

              C     C#        C++   HTML       Java  JavaScript     Python  \
precision   0.0   1.00   1.000000   1.00   1.000000    0.316456   1.000000   
recall      0.0   0.25   0.361111   0.60   0.062500    1.000000   0.393939   
f1-score    0.0   0.40   0.530612   0.75   0.117647    0.480769   0.565217   
support    18.0  24.00  36.000000  10.00  16.000000   50.000000  33.000000   

           TypeScript  accuracy   macro avg  weighted avg  
precision         0.0  0.451777    0.664557      0.684380  
recall            0.0  0.451777    0.333444      0.451777  
f1-score          0.0  0.451777    0.355531      0.410026  
support          10.0  0.451777  197.000000    197.000000  


RandomForest depth 4

              C         C#        C++   HTML       Java  JavaScript  \
precision   0.0   1.000000   1.000000   1.00   1.000000    0.340136   
recall      0.0   0.375000   0.500000   0.60   0.062500    1.000000   
f1-score    0.0   0.545455   0.666667   0.75   0.1

In [57]:
list(report)

['C',
 'C#',
 'C++',
 'HTML',
 'Java',
 'JavaScript',
 'Python',
 'TypeScript',
 'accuracy',
 'macro avg',
 'weighted avg']