In [1]:
import numpy as np
import pandas as pd

import nltk

from sklearn.feature_extraction.text import TfidfVectorizer

import j_acquire
import j_prep

In [2]:
# Pull the scraped repo records via the project-local j_acquire helper.
data = j_acquire.scrape_github_data('repo_source')

# Build the frame, discard rows with any missing value, and renumber the rows.
# reset_index() is called without drop=True, so the pre-drop row labels are
# kept as an extra "index" column (visible in the output below).
df = (
    pd.DataFrame.from_dict(data)
    .dropna()
    .reset_index()
)
df

Unnamed: 0,index,repo,language,readme_contents
0,1,tokyo-metropolitan-gov/covid19,Vue,# 東京都 新型コロナウイルス感染症対策サイト\n\n![](https://github....
1,3,covid19india/covid19india-react,JavaScript,"<p align=""center"">\n<img src=""https://lh3.goog..."
2,5,ieee8023/covid-chestxray-dataset,Jupyter Notebook,\n#### 🛑 Note: please do not claim diagnostic ...
3,7,ahmadawais/corona-cli,JavaScript,"<h4 align=""center"">\n <a href=""https://gith..."
4,8,ExpDev07/coronavirus-tracker-api,Python,"<h1 align=""center"">\n Coronavirus Tracker A..."
5,9,neherlab/covid19_scenarios,TypeScript,"<h1 id=""covid19_scenarios"" align=""center"">\n ..."
6,10,geohot/corona,Python,# Reverse engineering the coronavirus (SARS-Co...
7,11,soroushchehresa/awesome-coronavirus,JavaScript,"<div align=""center"">\n\t<br>\n\t<img src=""http..."
8,12,mhdhejazi/CoronaTracker,Swift,"<img src=""https://user-images.githubuserconten..."
9,13,pomber/covid19,JavaScript,Transforms the data from [CSSEGISandData/COVID...


In [3]:
# Run the project-local README preparation step; judging by the output it adds
# the basic_clean / clean_tokes / lemmatized / clean_lemmatized / link_counts columns.
# NOTE(review): `df` is rebound to its own prepared version, so re-running this
# cell alone feeds already-prepared data back into prep_readme_data.
df = df.pipe(j_prep.prep_readme_data)
df.head()

Unnamed: 0,index,repo,language,readme_contents,basic_clean,clean_tokes,lemmatized,clean_lemmatized,link_counts
0,1,tokyo-metropolitan-gov/covid19,Vue,# 東京都 新型コロナウイルス感染症対策サイト\n\n![](https://github....,\n\nhttpsgithubcomtokyometropolitangovcovid1...,[httpsgithubcomtokyometropolitangovcovid19work...,httpsgithubcomtokyometropolitangovcovid19workf...,httpsgithubcomtokyometropolitangovcovid19workf...,1822
1,3,covid19india/covid19india-react,JavaScript,"<p align=""center"">\n<img src=""https://lh3.goog...",\n\n\n\n\n heres our data api\n \n\n setup\n\...,"[heres, our, data, api, setup, npm, i, npm, st...",here our data api setup npm i npm start mainta...,data api setup npm npm start maintainer jeremy...,1822
2,5,ieee8023/covid-chestxray-dataset,Jupyter Notebook,\n#### 🛑 Note: please do not claim diagnostic ...,\n note please do not claim diagnostic perfor...,"[note, please, do, not, claim, diagnostic, per...",note please do not claim diagnostic performanc...,note please claim diagnostic performance model...,1822
3,7,ahmadawais/corona-cli,JavaScript,"<h4 align=""center"">\n <a href=""https://gith...",\n \n \n \n \n \n\ntrack th...,"[track, the, coronavirus, disease, covid19, or...",track the coronavirus disease covid19 or the n...,track coronavirus disease covid19 novel corona...,1822
4,8,ExpDev07/coronavirus-tracker-api,Python,"<h1 align=""center"">\n Coronavirus Tracker A...",\n coronavirus tracker api\n\n\nprovides up...,"[coronavirus, tracker, api, provides, uptodate...",coronavirus tracker api provides uptodate data...,coronavirus tracker api provides uptodate data...,1822


In [4]:
# How many repos there are per primary language.
df['language'].value_counts()

Python              7
JavaScript          6
Jupyter Notebook    5
TypeScript          2
Ruby                1
Vue                 1
HTML                1
R                   1
PHP                 1
Swift               1
Name: language, dtype: int64

In [5]:
# Languages with fewer than 3 repos are pooled into a single "other" bucket.
# For each bucket, glue the cleaned + lemmatized READMEs together into one long
# string so the words can be counted per language afterwards.

python_words = ' '.join(df.loc[df.language == 'Python', 'clean_lemmatized'])
# Despite the name, this bucket holds the JavaScript repos.
java_words = ' '.join(df.loc[df.language == 'JavaScript', 'clean_lemmatized'])
jupyter_words = ' '.join(df.loc[df.language == 'Jupyter Notebook', 'clean_lemmatized'])
# Everything that is not one of the three big languages.
other_words = ' '.join(
    df.loc[~df.language.isin(['Jupyter Notebook', 'Python', 'JavaScript']), 'clean_lemmatized']
)
all_words = ' '.join(df['clean_lemmatized'])

In [6]:
tokenizer = nltk.tokenize.ToktokTokenizer()

# Split every joined corpus string into a flat list of tokens, one list per bucket.
# At this point the *_freq names hold raw token lists, not counts yet.
(
    python_words_freq,
    java_words_freq,
    jupyter_words_freq,
    other_words_freq,
    all_words_freq,
) = map(
    tokenizer.tokenize,
    (python_words, java_words, jupyter_words, other_words, all_words),
)

In [7]:
# Per-bucket word frequency tables (word -> number of occurrences, most frequent first).
#
# The counts are derived directly from the joined corpus strings rather than from
# the current value of the *_freq names. Reading and rebinding the same name made
# this cell non-idempotent: running it a second time silently produced
# "counts of counts" instead of word counts.
python_words_freq = pd.Series(tokenizer.tokenize(python_words)).value_counts()
java_words_freq = pd.Series(tokenizer.tokenize(java_words)).value_counts()
jupyter_words_freq = pd.Series(tokenizer.tokenize(jupyter_words)).value_counts()
other_words_freq = pd.Series(tokenizer.tokenize(other_words)).value_counts()
all_words_freq = pd.Series(tokenizer.tokenize(all_words)).value_counts()

In [8]:
# Word frequencies for the Python bucket, as produced by value_counts() (most frequent first).
python_words_freq

model                                               95
individual                                          76
network                                             70
rate                                                65
tika                                                52
                                                    ..
bioinformatics                                       1
investigate                                          1
redocshttpscoronavirustrackerapiherokuappcomdocs     1
darpa                                                1
cumulative                                           1
Length: 2100, dtype: int64

In [9]:
# Combine the five frequency tables into one frame: one row per word, one column
# per bucket. Words that never occur in a bucket come out of the concat as NaN,
# so they are filled with 0 before casting the counts back to integers.
#
# The column labels are supplied through `keys=` instead of
# `.set_axis(..., inplace=False)`: the `inplace` argument of set_axis was removed
# in pandas 2.0 (TypeError there), while `keys=` behaves the same on old and new pandas.
word_counts = (
    pd.concat(
        [all_words_freq, python_words_freq, java_words_freq,
         jupyter_words_freq, other_words_freq],
        axis=1,
        sort=True,
        keys=['all', 'python', 'java', 'jupyter', 'other'],
    )
    .fillna(0)
    .astype(int)
)

word_counts.head()

Unnamed: 0,all,python,java,jupyter,other
0,58,48,10,0,0
0,28,0,0,28,0
2,8,8,0,0,0
1,4,4,0,0,0
2,8,8,0,0,0


In [10]:
# Ten most frequent words across all READMEs.
word_counts.sort_values('all', ascending=False).iloc[:10]

Unnamed: 0,all,python,java,jupyter,other
covid19,436,37,359,21,19
data,290,44,143,34,69
coronavirus,224,15,195,1,13
case,158,47,86,15,10
country,136,11,97,18,10
model,134,95,11,17,11
source,81,20,31,7,23
individual,79,76,1,1,1
rate,73,65,7,0,1
number,72,36,23,8,5


In [11]:
# Ten most frequent words within the Python READMEs.
word_counts.sort_values('python', ascending=False).iloc[:10]

Unnamed: 0,all,python,java,jupyter,other
model,134,95,11,17,11
individual,79,76,1,1,1
network,70,70,0,0,0
rate,73,65,7,0,1
tika,52,52,0,0,0
parameter,71,49,2,1,19
0,58,48,10,0,0
case,158,47,86,15,10
data,290,44,143,34,69
time,72,43,23,3,3


In [12]:
# TF-IDF features: one row per README, one column per vocabulary term.
tfidf = TfidfVectorizer()
tfidfs = tfidf.fit_transform(df.clean_lemmatized)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installations.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()

# toarray() yields a plain ndarray (todense() returns the legacy np.matrix type).
# Reusing df's index keeps the rows aligned with df for the index-based join later on.
vectorized_df = pd.DataFrame(tfidfs.toarray(), columns=feature_names, index=df.index)
vectorized_df

Unnamed: 0,00,002,01,02,04162020,05,05132020,05262020,06012020,0no,...,zero,zhang,zhao,zhidong,zhong,zhou,zinc,zingming,zoom,zum
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.630423,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.022515,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.019723,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.014731,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.021284,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.150628,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.006085,0.0,0.0,0.0,0.006085
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.108472,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.011377,0.0


In [13]:
# Number of distinct tokens in the README at position 2
# (ieee8023/covid-chestxray-dataset in the listing above).
len(set(tokenizer.tokenize(df['clean_lemmatized'].iloc[2])))

384

In [14]:
# Full token list of the README at position 2, for eyeballing what survives cleaning
# (note the URL remnants glued onto words in the output).
tokenizer.tokenize(df['clean_lemmatized'].iloc[2])

['note',
 'please',
 'claim',
 'diagnostic',
 'performance',
 'model',
 'without',
 'clinical',
 'study',
 'kaggle',
 'competition',
 'dataset',
 'please',
 'read',
 'paper',
 'evaluation',
 'issue',
 'httpsarxivorgabs200412823httpsarxivorgabs200412823',
 'covid19',
 'image',
 'data',
 'collection',
 'video',
 'projecthttpswwwyoutubecomwatchvinewmqfeleq',
 'project',
 'summary',
 'build',
 'public',
 'open',
 'dataset',
 'chest',
 'xray',
 'ct',
 'image',
 'patient',
 'positive',
 'suspected',
 'covid19',
 'viral',
 'bacterial',
 'pneumonia',
 'mershttpsenwikipediaorgwikimiddle_east_respiratory_syndrome',
 'sarshttpsenwikipediaorgwikisevere_acute_respiratory_syndrome',
 'ardshttpsenwikipediaorgwikiacute_respiratory_distress_syndrome',
 'data',
 'collected',
 'public',
 'source',
 'well',
 'indirect',
 'collection',
 'hospital',
 'physician',
 'image',
 'data',
 'released',
 'publicly',
 'github',
 'repo',
 'project',
 'approved',
 'university',
 'montreal',
 'ethic',
 'committee',
 'ce

In [15]:
# Tokenize each cleaned README once and derive both size features from the same
# token lists (previously every README was tokenized twice, once per feature).
readme_tokens = df.clean_lemmatized.apply(tokenizer.tokenize)

# Total number of tokens per README.
df['number_of_words'] = readme_tokens.apply(len)
# Number of distinct tokens per README.
df['num_unique_words'] = readme_tokens.apply(lambda tokens: len(set(tokens)))
df.head()

Unnamed: 0,index,repo,language,readme_contents,basic_clean,clean_tokes,lemmatized,clean_lemmatized,link_counts,number_of_words,num_unique_words
0,1,tokyo-metropolitan-gov/covid19,Vue,# 東京都 新型コロナウイルス感染症対策サイト\n\n![](https://github....,\n\nhttpsgithubcomtokyometropolitangovcovid1...,[httpsgithubcomtokyometropolitangovcovid19work...,httpsgithubcomtokyometropolitangovcovid19workf...,httpsgithubcomtokyometropolitangovcovid19workf...,1822,19,19
1,3,covid19india/covid19india-react,JavaScript,"<p align=""center"">\n<img src=""https://lh3.goog...",\n\n\n\n\n heres our data api\n \n\n setup\n\...,"[heres, our, data, api, setup, npm, i, npm, st...",here our data api setup npm i npm start mainta...,data api setup npm npm start maintainer jeremy...,1822,50,47
2,5,ieee8023/covid-chestxray-dataset,Jupyter Notebook,\n#### 🛑 Note: please do not claim diagnostic ...,\n note please do not claim diagnostic perfor...,"[note, please, do, not, claim, diagnostic, per...",note please do not claim diagnostic performanc...,note please claim diagnostic performance model...,1822,663,384
3,7,ahmadawais/corona-cli,JavaScript,"<h4 align=""center"">\n <a href=""https://gith...",\n \n \n \n \n \n\ntrack th...,"[track, the, coronavirus, disease, covid19, or...",track the coronavirus disease covid19 or the n...,track coronavirus disease covid19 novel corona...,1822,823,413
4,8,ExpDev07/coronavirus-tracker-api,Python,"<h1 align=""center"">\n Coronavirus Tracker A...",\n coronavirus tracker api\n\n\nprovides up...,"[coronavirus, tracker, api, provides, uptodate...",coronavirus tracker api provides uptodate data...,coronavirus tracker api provides uptodate data...,1822,1032,480


In [16]:
# Full token list of the README at position 6 (geohot/corona in the listing above).
tokenizer.tokenize(df['clean_lemmatized'].iloc[6])


['reverse',
 'engineering',
 'coronavirus',
 'sarscov2',
 'start',
 'coronapycoronapy',
 'thought_balloon',
 'background',
 'project',
 'applies',
 'technique',
 'reverse',
 'engineeringhttpsenwikipediaorgwikireverse_engineering',
 'understand',
 'sarscov2httpsenwikipediaorgwikisevere_acute_respiratory_syndrome_coronavirus_2',
 'virus',
 'goal',
 'simply',
 'build',
 'understanding',
 'virus',
 'first',
 'principle',
 'biology',
 'v',
 'software',
 'biological',
 'system',
 'fundamentally',
 'information',
 'processing',
 'systemshttpsenwikipediaorgwikiinformation_processor',
 'perfect',
 'analogy',
 'software',
 'provides',
 'useful',
 'framework',
 'thinking',
 'biology',
 'table',
 'provides',
 'rough',
 'outline',
 'analogy',
 'microscope',
 'biology',
 'computer',
 'software',
 'note',
 'nucleotidehttpsenwikipediaorgwikinucleotide',
 'bytehttpsenwikipediaorgwikibyte',
 'genomehttpsenwikipediaorgwikigenome',
 'bytecodehttpsenwikipediaorgwikibytecode',
 'translationhttpsenwikipediao

In [17]:
# Append the three README-level columns to the TF-IDF matrix, matching rows on the index.
# NOTE(review): link_counts is 1822 on every row shown in the outputs, so it looks
# like a corpus-wide total rather than a per-README value - confirm in j_prep.
vectorized_df = vectorized_df.join(
    other=df.loc[:, ['link_counts', 'number_of_words', 'num_unique_words']],
    how='left',
)


In [18]:
# Inspect the TF-IDF matrix now that link_counts / number_of_words / num_unique_words were joined on.
vectorized_df

Unnamed: 0,00,002,01,02,04162020,05,05132020,05262020,06012020,0no,...,zhidong,zhong,zhou,zinc,zingming,zoom,zum,link_counts,number_of_words,num_unique_words
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1822,19,19
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1822,50,47
2,0.630423,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.022515,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1822,663,384
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.019723,0.0,0.0,0.0,0.0,0.0,0.0,1822,823,413
4,0.0,0.0,0.014731,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1822,1032,480
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1822,1015,629
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.150628,0.0,0.0,0.0,1822,464,345
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.006085,0.0,0.0,0.0,0.006085,1822,4259,1821
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.108472,0.0,1822,301,255
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.011377,0.0,1822,1541,717
