In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

import explore
import modeling

# Suppress warnings for the rest of the kernel session. filterwarnings("ignore")
# is called with no category/module/message filter, so it applies to every
# warning, not only the noisy ones.
# NOTE(review): this also hides deprecation and convergence warnings from the
# modelling libraries imported above — consider narrowing the filter.
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Build the base frame from the 'repo_source' input and immediately pass it
# through explore.add_new_columns; only the enriched frame is kept as `df`.
df = explore.add_new_columns(
    explore.make_initial_df('repo_source')
)

In [3]:
# Derive the vectorized representation of `df` via the project helper; the
# result is handed to the modeling functions in the following cells.
# NOTE(review): which text column is vectorized (and with what vectorizer) is
# decided inside explore.make_vectorized_df — check there.
vectorized_df = explore.make_vectorized_df(df)

In [4]:
# Unpack the six objects returned by modeling.get_splits.
# NOTE(review): train_predictions and test_predictions are rebound by the
# modeling.make_predictions_df call in the next cell, and the X_*/y_* names are
# not referenced again in the visible part of this notebook — confirm whether
# this cell is still needed.
X_train, X_test, y_train, y_test, train_predictions, test_predictions = modeling.get_splits(df, vectorized_df)

In [5]:
# Rebind the two prediction frames from modeling.make_predictions_df. As
# displayed in the next cell, train_predictions holds the actual label, a
# baseline column, and one prediction column per model (lr / rf / knn).
train_predictions, test_predictions = modeling.make_predictions_df(df, vectorized_df)

In [6]:
# Display the train predictions frame (actual vs. baseline vs. lr/rf/knn columns).
train_predictions

Unnamed: 0,actual,baseline,lr_predictions,rf_predictions,knn_predictions
17,Jupyter Notebook,Python,Jupyter Notebook,Jupyter Notebook,Jupyter Notebook
14,Python,Python,Python,other,JavaScript
19,other,Python,other,other,JavaScript
4,Python,Python,Python,Python,Python
21,Jupyter Notebook,Python,Jupyter Notebook,Jupyter Notebook,Jupyter Notebook
2,Jupyter Notebook,Python,Jupyter Notebook,Jupyter Notebook,Jupyter Notebook
23,other,Python,other,other,other
12,Python,Python,Python,Python,Python
11,Python,Python,other,Python,other
15,Python,Python,Python,Python,JavaScript


In [7]:
# NOTE(review): disabled manual evaluation of lr_predictions against actual.
# modeling.train_evaluation(train_predictions), called later in this notebook,
# prints accuracy, a confusion matrix and a classification report for the
# logistic regression model, so this cell looks superseded — confirm and delete.
# print('Accuracy: {:.2%}'.format(accuracy_score(train_predictions.actual, train_predictions.lr_predictions)))
# print('---')
# print('Confusion Matrix')
# print(pd.crosstab(train_predictions.lr_predictions, train_predictions.actual))
# print('---')
# print(classification_report(train_predictions.actual, train_predictions.lr_predictions))

In [8]:
# NOTE(review): disabled manual evaluation of rf_predictions against actual.
# Presumably superseded by modeling.train_evaluation(train_predictions), called
# later in this notebook — verify it reports the random forest model, then
# delete this cell.
# print('Accuracy: {:.2%}'.format(accuracy_score(train_predictions.actual, train_predictions.rf_predictions)))
# print('---')
# print('Confusion Matrix')
# print(pd.crosstab(train_predictions.rf_predictions, train_predictions.actual))
# print('---')
# print(classification_report(train_predictions.actual, train_predictions.rf_predictions))

In [9]:
# NOTE(review): disabled manual evaluation of knn_predictions against actual.
# Presumably superseded by modeling.train_evaluation(train_predictions), called
# later in this notebook — verify it reports the KNN model, then delete this
# cell.
# print('Accuracy: {:.2%}'.format(accuracy_score(train_predictions.actual, train_predictions.knn_predictions)))
# print('---')
# print('Confusion Matrix')
# print(pd.crosstab(train_predictions.knn_predictions, train_predictions.actual))
# print('---')
# print(classification_report(train_predictions.actual, train_predictions.knn_predictions))

In [10]:
# Print the evaluation report for the train predictions. The saved output shows
# accuracy, a confusion matrix and a classification report for the logistic
# regression model.
# NOTE(review): `report` is not used again in the visible cells — confirm what
# train_evaluation returns before relying on it.
report = modeling.train_evaluation(train_predictions)

Evaluation Metrics for Logistic Regression Model


Accuracy: 90.00%
----------------------------------------------------------------------------------------------
Confusion Matrix
actual            JavaScript  Jupyter Notebook  Python  other
lr_predictions                                               
JavaScript                 4                 0       0      0
Jupyter Notebook           0                 4       0      0
Python                     0                 0       4      0
other                      1                 0       1      6
----------------------------------------------------------------------------------------------
                  precision    recall  f1-score   support

      JavaScript       1.00      0.80      0.89         5
Jupyter Notebook       1.00      1.00      1.00         4
          Python       1.00      0.80      0.89         5
           other       0.75      1.00      0.86         6

        accuracy                           0.90        20
   

## Hypothesis testing

- $H_0$: There is no difference between the mean number of links for JavaScript repos and the overall mean number of links across all repos.
- $H_a$: There is a difference between the mean number of links for JavaScript repos and the overall mean number of links across all repos.

In [31]:
# How many repos fall into each generalized language bucket.
df['gen_language'].value_counts()

other               8
Python              7
JavaScript          6
Jupyter Notebook    5
Name: gen_language, dtype: int64

In [11]:
from math import sqrt
from scipy import stats

In [23]:
# Mean link count per generalized language.
df.groupby('gen_language')['link_counts'].mean()

gen_language
JavaScript          150.166667
Jupyter Notebook     30.800000
Python               40.857143
other                61.000000
Name: link_counts, dtype: float64

In [24]:
# Keep only the rows whose generalized language is JavaScript, then preview them.
is_javascript = df['gen_language'] == 'JavaScript'
javascript = df.loc[is_javascript]
javascript.head()

Unnamed: 0,index,repo,language,readme_contents,basic_clean,clean_tokes,lemmatized,clean_lemmatized,gen_language,without_numbers,num_words,num_unique_words,link_counts,py_extensions,js_extensions,ipynb_extensions
1,3,covid19india/covid19india-react,JavaScript,"<p align=""center"">\n<img src=""https://lh3.goog...",\n\n\n\n\n heres our data api\n \n\n setup\n\...,"[heres, our, data, api, setup, npm, i, npm, st...",here our data api setup npm i npm start mainta...,data api setup npm npm start maintainer jeremy...,JavaScript,data api setup npm npm start maintainer jeremy...,50,47,7,0,0,0
3,7,ahmadawais/corona-cli,JavaScript,"<h4 align=""center"">\n <a href=""https://gith...",\n \n \n \n \n \n\ntrack th...,"[track, the, coronavirus, disease, covid19, or...",track the coronavirus disease covid19 or the n...,track coronavirus disease covid19 novel corona...,JavaScript,track coronavirus disease covid19 novel corona...,823,413,70,0,2,0
7,11,soroushchehresa/awesome-coronavirus,JavaScript,"<div align=""center"">\n\t<br>\n\t<img src=""http...",\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\tsponsors\n\t\n...,"[&#9;, &#9;, &#9;, &#9;, &#9;, &#9;, &#9;, &#9...",sponsor awesome coronavirus awesom...,sponsor awesome coronavirus awesomehttpsawesom...,JavaScript,sponsor awesome coronavirus awesomehttpsawesom...,4290,1836,581,2,5,1
9,13,pomber/covid19,JavaScript,Transforms the data from [CSSEGISandData/COVID...,transforms the data from cssegisanddatacovid19...,"[transforms, the, data, from, cssegisanddataco...",transforms the data from cssegisanddatacovid19...,transforms data cssegisanddatacovid19httpsgith...,JavaScript,transforms data cssegisanddatacovid19httpsgith...,1541,717,235,0,15,0
18,22,javieraviles/covidAPI,JavaScript,# COVID API\nAPI for live information about CO...,covid api\napi for live information about cov...,"[covid, api, api, for, live, information, abou...",covid api api for live information about covid...,covid api api live information covid19 get htt...,JavaScript,covid api api live information covid19 get htt...,40,31,7,0,0,0


In [25]:
# Sample mean of link counts within the JavaScript subset.
javascript['link_counts'].mean()

150.16666666666666

In [21]:
# Mean link count over every repo; used as the comparison value in the t-test.
df['link_counts'].mean()

70.34615384615384

In [28]:
# One-sample t-test: JavaScript link counts against the mean link count of all
# repos in df.
t, p = stats.ttest_1samp(
    a=javascript['link_counts'],
    popmean=df['link_counts'].mean(),
)

print(f't = {t:.3f}')
print(f'p = {p:.3f}')


t = 0.853
p = 0.432


- $H_0$: There is no difference between the mean number of words for Python repos and the overall mean number of words across all repos.
- $H_a$: There is a difference between the mean number of words for Python repos and the overall mean number of words across all repos.

In [30]:
# Mean word count per generalized language.
df.groupby('gen_language')['num_words'].mean()

gen_language
JavaScript          1127.833333
Jupyter Notebook     442.800000
Python               915.000000
other                341.000000
Name: num_words, dtype: float64

In [None]:
# Keep only the rows whose generalized language is Python, then preview them.
is_python = df['gen_language'] == 'Python'
python = df.loc[is_python]
python.head()

In [32]:
# One-sample t-test: Python word counts against the mean word count of all
# repos in df.
t, p = stats.ttest_1samp(
    a=python['num_words'],
    popmean=df['num_words'].mean(),
)

print(f't = {t:.3f}')
print(f'p = {p:.3f}')

t = 0.619
p = 0.559
