In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix

import explore
import modeling

In [2]:
# Build the initial dataframe and pass it straight through add_new_columns.
df = explore.add_new_columns(explore.make_initial_df())

In [3]:
# Apply the link-count binning first, then the word-count binning.
df = explore.bin_word_counts(explore.bin_link_counts(df))

In [4]:
# Vectorize df, then let aggregate_columns add its extra columns to the result.
vectorized_df = explore.aggregate_columns(explore.make_vectorized_df(df))

# Preview the first rows of the vectorized frame.
vectorized_df.head()

Unnamed: 0,00,000,0000,000000,00008100,0001twosumproblems1twosumenmd,0002,0003,0004732,0004medianoftwosortedarrayproblems4medianoftwosortedarraymd,...,provide+,recommend+,release+,require+,run+,start+,support+,use+,work+,politeness
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.065301
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.001941,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.014578,0.03761,0.030403,0.030292,0.0,0.056998


In [5]:
# get_splits hands back six objects: scaled train/test features, train/test
# targets, and the two frames that will collect model predictions.
(
    X_train_scaled,
    X_test_scaled,
    y_train,
    y_test,
    train_predictions,
    test_predictions,
) = explore.get_splits(df, vectorized_df)

# Reduce the scaled, vectorized train/test features.
X_train_reduced, X_test_reduced = explore.prep_vectorized_df(
    X_train_scaled, X_test_scaled
)

# At this point the frame shows only the `actual` labels.
train_predictions.head()

Unnamed: 0,actual
279,JavaScript
156,Jupyter Notebook
31,Python
125,JavaScript
110,other


In [6]:
# Extend both prediction frames with the model predictions
# (the preview in the next cell shows baseline, lr, rf and knn columns).
train_predictions, test_predictions = modeling.make_predictions_df(
    X_train_reduced,
    X_test_reduced,
    y_train,
    train_predictions,
    test_predictions,
)



In [7]:
# Preview the first rows of train_predictions after the prediction columns were added.
train_predictions.head()

Unnamed: 0,actual,baseline,lr_predictions,rf_predictions,knn_predictions
279,JavaScript,Python,JavaScript,JavaScript,JavaScript
156,Jupyter Notebook,Python,Jupyter Notebook,Jupyter Notebook,Jupyter Notebook
31,Python,Python,Python,Python,JavaScript
125,JavaScript,Python,JavaScript,JavaScript,JavaScript
110,other,Python,other,Python,Java


In [8]:
# Evaluate the models on the training predictions; train_evaluation prints its
# report and whatever it returns is kept in evaluation_report.
evaluation_report = modeling.train_evaluation(train_predictions)

Evaluation Metrics for Logistic Regression Model


Accuracy: 91.83%
----------------------------------------------------------------------------------------------
Confusion Matrix
actual            C++  Java  JavaScript  Jupyter Notebook  Python  TypeScript  \
lr_predictions                                                                  
C++                14     0           0                 0       0           0   
Java                0     5           0                 0       0           0   
JavaScript          2     0          45                 1       1           1   
Jupyter Notebook    0     1           0                27       0           0   
Python              0     3           0                 4      62           0   
TypeScript          0     0           0                 0       0          14   
other               0     0           0                 1       0           0   

actual            other  
lr_predictions           
C++                   0  
Java        

  'precision', 'predicted', average, warn_for)


In [9]:
def test_evaluation(test_predictions, model_col='lr_predictions', model_name='Logistic Regression'):
    """Print accuracy, a confusion matrix and a classification report for test data.

    With the default arguments this evaluates the logistic regression
    predictions and prints exactly what the function printed before the
    extra parameters were introduced.

    Parameters
    ----------
    test_predictions : pandas.DataFrame
        Frame with an ``actual`` column (true labels) and a column named
        ``model_col`` (predicted labels).
    model_col : str, default 'lr_predictions'
        Column holding the predictions to evaluate, e.g. ``'rf_predictions'``
        or ``'knn_predictions'`` when those columns are present.
    model_name : str, default 'Logistic Regression'
        Human-readable model name used in the printed heading.

    Returns
    -------
    None
        All results are written to stdout.

    Raises
    ------
    KeyError
        If ``actual`` or ``model_col`` is not a column of ``test_predictions``.
    """
    separator = '----------------------------------------------------------------------------------------------'

    # Look the two label columns up once instead of once per metric.
    actual = test_predictions['actual']
    predicted = test_predictions[model_col]

    # Accuracy score, confusion matrix and classification report for test data
    print('Evaluation Metrics for {} Model'.format(model_name))
    print()
    print()
    print('Accuracy: {:.2%}'.format(accuracy_score(actual, predicted)))
    print(separator)
    print('Confusion Matrix')
    # Rows are predicted labels, columns are actual labels. crosstab only
    # lists labels that occur, so a class that was never predicted has no row.
    print(pd.crosstab(predicted, actual))
    print(separator)
    print(classification_report(actual, predicted))

    # Five blank lines of padding so consecutive reports are visually separated.
    for _ in range(5):
        print()

In [10]:
# Print the logistic regression metrics for the held-out test predictions.
test_evaluation(test_predictions)

Evaluation Metrics for Logistic Regression Model


Accuracy: 54.44%
----------------------------------------------------------------------------------------------
Confusion Matrix
actual            C++  Java  JavaScript  Jupyter Notebook  Python  TypeScript  \
lr_predictions                                                                  
C++                 3     0           0                 1       0           0   
JavaScript          1     0          14                 0       1           5   
Jupyter Notebook    0     0           0                 6       1           0   
Python              2     3           4                 7      24           0   
TypeScript          0     0           1                 0       0           2   
other               1     1           1                 0       1           0   

actual            other  
lr_predictions           
C++                   0  
JavaScript            0  
Jupyter Notebook      1  
Python               10  
TypeScript     

In [11]:
# Inspect the link_bins column of df (an ordered categorical: small < medium < large).
# NOTE(review): presumably created by explore.bin_link_counts — confirm.
df.link_bins

0      medium
1       small
2       small
3       small
4      medium
        ...  
293     small
294     small
295     small
296     small
297     small
Name: link_bins, Length: 298, dtype: category
Categories (3, object): [small < medium < large]

In [12]:
# make train and test predictions dataframe starting with actual values
# train_predictions = pd.DataFrame(y_train)
# train_predictions.columns = ['actual']
# test_predictions = pd.DataFrame(y_test)
# test_predictions.columns = ['actual']

# add baseline to predictions dataframe
# train_predictions['baseline'] = 'Python'
# test_predictions['baseline'] = 'Python'
# train_predictions

# test_predictions

# train, test = modeling.logistic_regression(df, vectorized_df)

# train

# rf = RandomForestClassifier(bootstrap=True, 
#                             class_weight=None, 
#                             criterion='gini',
#                             min_samples_leaf=3,
#                             n_estimators=100,
#                             max_depth=3, 
#                             random_state=123)

# rf.fit(X_train, y_train)

# train_random_forest_predictions = rf.predict(X_train)
# test_random_forest_predictions = rf.predict(X_test)

# knn = KNeighborsClassifier(n_neighbors=5, weights='uniform')

# knn.fit(X_train, y_train)

# train_knn_predictions = knn.predict(X_train)
# test_knn_predictions = knn.predict(X_test)

# X_train_scaled, X_test_scaled, X_train_reduced, X_test_reduced = explore.prep_vectorized_df(df, vectorized_df)

# X_train_scaled, X_test_scaled, X_train_reduced, X_test_reduced = modeling.make_predictions_df(df, vectorized_df)

