In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix

import explore
import modeling

In [2]:
#df = explore.make_initial_df()
#df = explore.add_new_columns(df)

In [4]:
# Load the prepared dataset from the CSV file in the working directory
df = pd.read_csv('prepared_df.csv')

In [5]:
# Show how often each value occurs in the `language` column
df.language.value_counts()

Python              90
JavaScript          65
Jupyter Notebook    47
C++                 23
TypeScript          22
Java                13
HTML                 6
Go                   6
Vue                  4
TeX                  2
Swift                2
Ruby                 2
Shell                2
PHP                  2
C#                   2
Scala                1
CSS                  1
Lua                  1
CoffeeScript         1
R                    1
Perl                 1
Julia                1
Clojure              1
C                    1
Kotlin               1
Name: language, dtype: int64

In [6]:
# Build the vectorized feature frame from df using the project's explore module
vectorized_df = explore.make_vectorized_df(df)

In [6]:
# Split the vectorized features and the gen_language target into train and test sets,
# stratifying on the target and holding out 20% of the rows
X, y = vectorized_df, df.gen_language
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.2,
    stratify=y,
    random_state=123,
)

In [7]:
def logistic_regression(train, test):
    """Fit a logistic regression on the notebook's training split and report train metrics.

    The model is fit on the notebook-level ``X_train`` / ``y_train`` and used to
    predict both ``X_train`` and ``X_test``.

    Parameters
    ----------
    train, test
        Accepted only for backward compatibility. The original implementation
        rebound both names before reading them, so their values have never
        influenced the result; they are ignored here as well.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        Train and test frames, each with an ``actual`` and a ``predicted``
        column. Previously the function returned nothing, so the test
        predictions were computed and then thrown away.

    Notes
    -----
    The printed accuracy, crosstab and classification report are computed on
    the training split only.
    """
    # Use distinct local names rather than silently overwriting the parameters.
    train_results = pd.DataFrame(dict(actual=y_train))
    test_results = pd.DataFrame(dict(actual=y_test))

    lm = LogisticRegression().fit(X_train, y_train)

    train_results['predicted'] = lm.predict(X_train)
    test_results['predicted'] = lm.predict(X_test)

    print('Accuracy: {:.2%}'.format(accuracy_score(train_results.actual, train_results.predicted)))
    print('---')
    print('Confusion Matrix')
    print(pd.crosstab(train_results.predicted, train_results.actual))
    print('---')
    print(classification_report(train_results.actual, train_results.predicted))

    # Hand both frames back so callers can evaluate the held-out predictions too.
    return train_results, test_results

In [8]:
# make train and test predictions dataframes starting with actual values
# train_predictions = pd.DataFrame(y_train)
# train_predictions.columns = ['actual']
# test_predictions = pd.DataFrame(y_test)
# test_predictions.columns = ['actual']

# add baseline to predictions dataframe
# train_predictions['baseline'] = 'Python'
# test_predictions['baseline'] = 'Python'
# train_predictions

In [9]:
# test_predictions

In [10]:
# train, test = modeling.logistic_regression(df, vectorized_df)

In [11]:
# train

In [12]:
# Random forest of 100 trees, limited to depth 3, with a fixed seed
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=3,
    min_samples_leaf=3,
    criterion='gini',
    bootstrap=True,
    class_weight=None,
    random_state=123,
)

In [13]:
# Fit the random forest on the training split; the cell output echoes the estimator's repr
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=3, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=3, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=123,
                       verbose=0, warm_start=False)

In [14]:
# Predict labels for the train split, then the test split, with the fitted random forest
train_random_forest_predictions, test_random_forest_predictions = (
    rf.predict(features) for features in (X_train, X_test)
)

In [15]:
# k-nearest-neighbours classifier: 5 neighbours, each with an equal vote
knn = KNeighborsClassifier(
    weights='uniform',
    n_neighbors=5,
)

In [16]:
# Fit the KNN classifier on the training split; the cell output echoes the estimator's repr
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [17]:
# Predict labels for the train split, then the test split, with the fitted KNN classifier
train_knn_predictions, test_knn_predictions = (
    knn.predict(features) for features in (X_train, X_test)
)

In [18]:
#X_train_scaled, X_test_scaled, X_train_reduced, X_test_reduced = explore.prep_vectorized_df(df, vectorized_df)

In [19]:
#X_train_scaled, X_test_scaled, X_train_reduced, X_test_reduced = modeling.make_predictions_df(df, vectorized_df)

ValueError: not enough values to unpack (expected 7, got 6)

In [2]:
# Reload the prepared data from its CSV file
df = pd.read_csv('prepared_df.csv')
# Vectorize df and pass the result straight through explore.aggregate_columns,
# so vectorized_df is assigned once and re-running the cell gives the same frame
vectorized_df = explore.aggregate_columns(explore.make_vectorized_df(df))
vectorized_df.head()

Unnamed: 0,00,000,0000,000000,00008100,0001twosumproblems1twosumenmd,0002,0003,0004732,0004medianoftwosortedarrayproblems4medianoftwosortedarraymd,...,provide+,recommend+,release+,require+,run+,start+,support+,use+,work+,politeness
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.065057
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.001941,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.014591,0.03747,0.030429,0.030273,0.0,0.057046


In [3]:
# Get the scaled feature splits, the targets, and the train/test prediction frames
(
    X_train_scaled,
    X_test_scaled,
    y_train,
    y_test,
    train_predictions,
    test_predictions,
) = explore.get_splits(df, vectorized_df)

# Reduce the scaled feature frames
X_train_reduced, X_test_reduced = explore.prep_vectorized_df(
    X_train_scaled,
    X_test_scaled,
)

train_predictions.head()

Unnamed: 0,actual
279,JavaScript
156,Jupyter Notebook
31,Python
125,JavaScript
110,other


In [4]:
# Build the prediction frames from the reduced features.
# NOTE(review): this rebinds train_predictions/test_predictions to the returned frames,
# so re-running the cell feeds its own output back in — confirm that is safe.
train_predictions, test_predictions = modeling.make_predictions_df(
    X_train_reduced,
    X_test_reduced,
    y_train,
    train_predictions,
    test_predictions,
)

In [5]:
# Preview the first rows; the frame now carries the baseline and per-model prediction columns
train_predictions.head()

Unnamed: 0,actual,baseline,lr_predictions,rf_predictions,knn_predictions
279,JavaScript,Python,JavaScript,JavaScript,Java
156,Jupyter Notebook,Python,Jupyter Notebook,Jupyter Notebook,Jupyter Notebook
31,Python,Python,Python,Python,JavaScript
125,JavaScript,Python,JavaScript,JavaScript,JavaScript
110,other,Python,other,Python,Java


In [6]:
# Print evaluation metrics (accuracy, confusion matrix, ...) for the train prediction frame
modeling.train_evaluation(train_predictions)

Evaluation Metrics for Logistic Regression Model


Accuracy: 93.27%
----------------------------------------------------------------------------------------------
Confusion Matrix
actual            C++  Java  JavaScript  Jupyter Notebook  Python  TypeScript  \
lr_predictions                                                                  
C++                14     0           0                 0       0           0   
Java                0     7           0                 0       0           0   
JavaScript          2     0          45                 1       1           1   
Jupyter Notebook    0     0           0                28       0           0   
Python              0     2           0                 3      62           0   
TypeScript          0     0           0                 0       0          14   
other               0     0           0                 1       0           0   

actual            other  
lr_predictions           
C++                   0  
Java        

  _warn_prf(average, modifier, msg_start, len(result))
