In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import nltk.sentiment
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
import pandas as pd
import numpy as np

from math import sqrt
from scipy import stats

import os
import unicodedata
import re
import json

import j_acquire
import j_prep
import explore

# Global matplotlib defaults for every figure in this notebook:
# a 13x10 inch canvas and a 13pt base font.
plt.rcParams['figure.figsize'] = (13, 10)
plt.rcParams['font.size'] = 13

In [2]:
# Build the base frame with the project helper, then let add_new_columns
# extend it, in a single expression; preview the first rows.
df = explore.add_new_columns(explore.make_initial_df())
df.head()

Unnamed: 0,repo,language,readme_contents,basic_clean,clean_tokes,lemmatized,clean_lemmatized,gen_language,without_numbers,num_words,num_unique_words,link_counts,py_extensions,js_extensions,ipynb_extensions
0,CyC2018/CS-Notes,Java,"<div align=""center"">\r\n <a href=""https://g...",\r\n \r\n \r\n \r\n \r\n\r\n\r...,"[nbspnbsp, nbspnbsp, nbspnbspnbspnbspnbspnbspn...",nbspnbsp nbspnbsp nbspnbspnbspnbspnbspnbspnbsp...,nbspnbsp nbspnbsp nbspnbspnbspnbspnbspnbspnbsp...,Java,nbspnbsp nbspnbsp nbspnbspnbspnbspnbspnbspnbsp...,76,52,76,0,0,0
1,TheAlgorithms/Python,Python,# The Algorithms - Python\n[![Gitpod Ready-to-...,the algorithms python\ngitpod readytocodehtt...,"[the, algorithms, python, gitpod, readytocodeh...",the algorithm python gitpod readytocodehttpsim...,algorithm python gitpod readytocodehttpsimgshi...,Python,algorithm python gitpod readytocodehttpsimgshi...,43,36,17,0,0,0
2,trekhleb/javascript-algorithms,JavaScript,# Algoritmos y Estructuras de Datos en JavaScr...,algoritmos y estructuras de datos en javascri...,"[algoritmos, y, estructuras, de, datos, en, ja...",algoritmos y estructuras de datos en javascrip...,algoritmos estructuras de datos en javascript ...,JavaScript,algoritmos estructuras de datos en javascript ...,1718,593,7,0,2,0
3,kdn251/interviews,Java,> * 原文地址：[github.com/kdn251/interviews](https:...,githubcomkdn251interviewshttpsgithubcomkdn25...,[githubcomkdn251interviewshttpsgithubcomkdn251...,githubcomkdn251interviewshttpsgithubcomkdn251i...,githubcomkdn251interviewshttpsgithubcomkdn251i...,Java,githubcomkdn251interviewshttpsgithubcomkdn251i...,443,293,29,0,0,0
4,yangshun/tech-interview-handbook,JavaScript,"<h1 align=""center"">Tech Interview Handbook</h1...",tech interview handbook\n\n\n \n \n \n \...,"[tech, interview, handbook, credits, illustrat...",tech interview handbook credit illustration by...,tech interview handbook credit illustration ya...,JavaScript,tech interview handbook credit illustration ya...,364,235,55,0,0,0


In [3]:
# Dimensions of the prepared frame as (rows, columns).
df.shape

(298, 15)

In [4]:
# Build the word-count table from the prepared frame using the project's explore helper.
# Judging by the preview, it holds an overall count plus one count column per language group.
word_counts_df = explore.make_word_counts_df(df)
word_counts_df.head()

Unnamed: 0,all,python,javascript,jupyter,c_plus,typescript,java,other
0,537,348,88,15,59,3,4,20
0,30,0,0,30,0,0,0,0
0,2,2,0,0,0,0,0,0
0,7,1,0,3,2,0,0,1
0,2,0,0,2,0,0,0,0


In [5]:
# Vectorize the README text using the project's explore helper. The preview shows one
# column per vocabulary term followed by the engineered count features.
# NOTE(review): the weighting scheme (e.g. TF-IDF vs. raw counts) is not visible here — confirm in explore.
vectorized_df = explore.make_vectorized_df(df)
vectorized_df.head()

Unnamed: 0,00,000,0000,000000,00008100,0001twosumproblems1twosumenmd,0002,0003,0004732,0004medianoftwosortedarrayproblems4medianoftwosortedarraymd,...,zxf,zxvf,zynga,zyszyshttpsgithubcomzyszys,num_words,num_unique_words,link_counts,py_extensions,js_extensions,ipynb_extensions
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,76,52,76,0,0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,43,36,17,0,0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1718,593,7,0,2,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,443,293,29,0,0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,364,235,55,0,0,0


In [6]:
# Dimensions of the vectorized frame as (rows, columns); the column count is the feature-space width.
vectorized_df.shape

(298, 41839)

In [7]:
# Turn the vectorized frame into a train/test pair using the project's explore helper.
# NOTE(review): presumably this splits, scales and reduces the feature set — confirm in
# explore.prep_vectorized_df; only the call and the resulting shape are visible here.
X_train_reduced, X_test_reduced = explore.prep_vectorized_df(df, vectorized_df)
# Dimensions of the training features as (rows, columns).
X_train_reduced.shape

(208, 233)

- Confidence level: 99%
- $\alpha = 1 - 0.99 = 0.01$
- $p = P(\text{data at least this extreme} \mid H_0)$
- $p < \alpha$ $\rightarrow$ reject $H_0$
- $p \geq \alpha$ $\rightarrow$ fail to reject $H_0$

## Hypothesis testing
- $H_0$: There is no difference between the mean number of links for JavaScript repos and the overall mean number of links.
- $H_a$: There is a difference between the mean number of links for JavaScript repos and the overall mean number of links.

In [8]:
# Preview the first rows of the reduced training features.
X_train_reduced.head()

Unnamed: 0,10,100,36,add,additional,already,also,alternatively,analysis,api,...,window,without,work,working,would,written,num_words,num_unique_words,link_counts,py_extensions
279,0.0,0.0,0.0,0.136945,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.024621,0.06449,0.043984,0.020833
156,0.437619,0.0,0.314872,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.26541,0.199879,0.0,0.0,0.053909,0.099184,0.023286,0.0
31,0.0,0.0,0.0,0.08071,0.0,0.0,0.132512,0.0,0.0,0.0,...,0.161485,0.0,0.076812,0.17354,0.281604,0.722199,0.044457,0.098776,0.075032,0.604167
125,0.0,0.0,0.0,0.182792,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.77448,0.0,0.0,0.0,0.0,0.016803,0.042449,0.01423,0.020833
110,0.0,0.0,0.0,0.0,0.441224,0.0,0.0,0.0,0.0,0.110689,...,0.0,0.0,0.0,0.0,0.0,0.0,0.029988,0.076735,0.03881,0.0625


In [9]:
# Number of repos per generalized language label, most frequent first.
df['gen_language'].value_counts()

Python              90
JavaScript          65
Jupyter Notebook    47
other               38
C++                 23
TypeScript          22
Java                13
Name: gen_language, dtype: int64

In [10]:
# Keep only the rows whose generalized language label is JavaScript, then preview them.
javascript = df.loc[df['gen_language'] == 'JavaScript']
javascript.head()

Unnamed: 0,repo,language,readme_contents,basic_clean,clean_tokes,lemmatized,clean_lemmatized,gen_language,without_numbers,num_words,num_unique_words,link_counts,py_extensions,js_extensions,ipynb_extensions
2,trekhleb/javascript-algorithms,JavaScript,# Algoritmos y Estructuras de Datos en JavaScr...,algoritmos y estructuras de datos en javascri...,"[algoritmos, y, estructuras, de, datos, en, ja...",algoritmos y estructuras de datos en javascrip...,algoritmos estructuras de datos en javascript ...,JavaScript,algoritmos estructuras de datos en javascript ...,1718,593,7,0,2,0
4,yangshun/tech-interview-handbook,JavaScript,"<h1 align=""center"">Tech Interview Handbook</h1...",tech interview handbook\n\n\n \n \n \n \...,"[tech, interview, handbook, credits, illustrat...",tech interview handbook credit illustration by...,tech interview handbook credit illustration ya...,JavaScript,tech interview handbook credit illustration ya...,364,235,55,0,0,0
5,azl397985856/leetcode,JavaScript,# LeetCode\n\n[![Travis](https://img.shields.i...,leetcode\n\ntravishttpsimgshieldsiobadgelangu...,"[leetcode, travishttpsimgshieldsiobadgelanguag...",leetcode travishttpsimgshieldsiobadgelanguagec...,leetcode travishttpsimgshieldsiobadgelanguagec...,JavaScript,leetcode travishttpsimgshieldsiobadgelanguagec...,632,442,17,0,0,0
6,algorithm-visualizer/algorithm-visualizer,JavaScript,# Algorithm Visualizer\n\n> Algorithm Visualiz...,algorithm visualizer\n\n algorithm visualizer...,"[algorithm, visualizer, algorithm, visualizer,...",algorithm visualizer algorithm visualizer is a...,algorithm visualizer algorithm visualizer inte...,JavaScript,algorithm visualizer algorithm visualizer inte...,84,61,10,0,0,0
18,mgechev/javascript-algorithms,JavaScript,## About\n\n[![Build Status](https://travis-ci...,about\n\nbuild statushttpstravisciorgmgechevj...,"[about, build, statushttpstravisciorgmgechevja...",about build statushttpstravisciorgmgechevjavas...,build statushttpstravisciorgmgechevjavascripta...,JavaScript,build statushttpstravisciorgmgechevjavascripta...,198,154,123,0,1,0


In [11]:
# One-sample, two-sided t-test: link counts of the JavaScript repos against
# the mean link count across every repo in df.
t, p = stats.ttest_1samp(
    a=javascript.link_counts,
    popmean=df.link_counts.mean(),
)

print(f't = {t:.3f}')
print(f'p = {p:.3f}')

t = 1.203
p = 0.233


- $H_0$: There is no difference between the mean number of words for Python repos and the overall mean number of words for all repos.
- $H_a$: There is a difference between the mean number of words for Python repos and the overall mean number of words for all repos.

In [12]:
# Average README word count within each generalized language group.
df.groupby('gen_language')['num_words'].mean()

gen_language
C++                 914.086957
Java                611.076923
JavaScript          959.107692
Jupyter Notebook    680.361702
Python              882.622222
TypeScript          326.636364
other               752.842105
Name: num_words, dtype: float64

In [13]:
# Keep only the rows whose generalized language label is Python, then preview them.
python = df.loc[df['gen_language'] == 'Python']
python.head()

Unnamed: 0,repo,language,readme_contents,basic_clean,clean_tokes,lemmatized,clean_lemmatized,gen_language,without_numbers,num_words,num_unique_words,link_counts,py_extensions,js_extensions,ipynb_extensions
1,TheAlgorithms/Python,Python,# The Algorithms - Python\n[![Gitpod Ready-to-...,the algorithms python\ngitpod readytocodehtt...,"[the, algorithms, python, gitpod, readytocodeh...",the algorithm python gitpod readytocodehttpsim...,algorithm python gitpod readytocodehttpsimgshi...,Python,algorithm python gitpod readytocodehttpsimgshi...,43,36,17,0,0,0
8,donnemartin/interactive-coding-challenges,Python,"<br/>\n<p align=""center"">\n <img src=""https:/...",\n\n \n\n\ninteractivecodingchallenges\n\n\n1...,"[interactivecodingchallenges, 120, continually...",interactivecodingchallenges 120 continually up...,interactivecodingchallenges 120 continually up...,Python,interactivecodingchallenges 120 continually up...,1520,698,356,4,0,131
9,keon/algorithms,Python,[![PyPI version](https://badge.fury.io/py/algo...,pypi versionhttpsbadgefuryiopyalgorithmssvghtt...,"[pypi, versionhttpsbadgefuryiopyalgorithmssvgh...",pypi versionhttpsbadgefuryiopyalgorithmssvghtt...,pypi versionhttpsbadgefuryiopyalgorithmssvghtt...,Python,pypi versionhttpsbadgefuryiopyalgorithmssvghtt...,449,409,10,305,0,0
14,apachecn/awesome-algorithm,Python,"<div align=""center"">\n <a href=""https://www...",\n \n \n \n \n \n\n\n...,"[1, 2, httpsgithubcomjiangzhonglian, httpsgith...",1 2 httpsgithubcomjiangzhonglian httpsgithubco...,1 2 httpsgithubcomjiangzhonglian httpsgithubco...,Python,1 2 httpsgithubcomjiangzhonglian httpsgithubco...,25,12,17,0,0,0
22,OlafenwaMoses/ImageAI,Python,# ImageAI (v2.1.5)\n\n![Discourse status](http...,imageai v215\n\ndiscourse statushttpsimgshiel...,"[imageai, v215, discourse, statushttpsimgshiel...",imageai v215 discourse statushttpsimgshieldsio...,imageai v215 discourse statushttpsimgshieldsio...,Python,imageai v215 discourse statushttpsimgshieldsio...,1309,586,81,0,0,0


In [14]:
# One-sample, two-sided t-test: word counts of the Python repos against
# the mean word count across every repo in df.
t, p = stats.ttest_1samp(
    a=python.num_words,
    popmean=df.num_words.mean(),
)

print(f't = {t:.3f}')
print(f'p = {p:.3f}')

t = 0.764
p = 0.447


In [15]:
# df.link_counts.sort_values()

In [16]:
# cut_labels = ['small', 'medium', 'large']
# cut_bins = [0, 45, 100, 800]
# df['link_bins'] = pd.cut(df['link_counts'], bins=cut_bins, labels=cut_labels)

In [17]:
# df.link_bins.value_counts()

In [18]:
# plt.hist(df.link_counts, bins = 500)

In [19]:
# df.isnull().sum()

In [20]:
# df[df['link_bins'].isnull()]

In [21]:
# df['link_bins'].fillna('small', inplace = True) 

In [22]:
# df.isnull().sum()

In [23]:
# df[df['link_bins'].isnull()]

In [24]:
# plt.hist(df.num_words, bins = 200)

In [25]:
# cut_labels = ['small', 'medium', 'large']
# cut_bins = [0, 1600, 2200, 9000]
# df['word_bins'] = pd.cut(df['num_words'], bins=cut_bins, labels=cut_labels)

In [26]:
# df[df['link_bins'].isnull()]

In [27]:
# def bin_link_counts(df):
#     '''
#     Takes in a df and returns it with a column showing if the
#     number of links is small, medium or large
#     '''
#     cut_labels = ['small', 'medium', 'large']
#     cut_bins = [0, 45, 100, 800]
#     df['link_bins'] = pd.cut(df['link_counts'], bins=cut_bins, labels=cut_labels)
#     df['link_bins'].fillna('small', inplace = True) 
#     return df
    

In [28]:
# def bin_word_counts(df):
#     '''
#     Takes in a df and returns it with a column showing if the
#     number of words is small, medium or large
#     '''
#     cut_labels = ['small', 'medium', 'large']
#     cut_bins = [0, 1600, 2200, 9000]
#     df['word_bins'] = pd.cut(df['num_words'], bins=cut_bins, labels=cut_labels)
#     return df

In [29]:
# Run both project binning helpers on the frame in one expression:
# link counts first, then word counts.
df = explore.bin_word_counts(explore.bin_link_counts(df))