In [14]:
import pandas as pd
import numpy as np

In [4]:
df = pd.read_csv("total_scraped_data_v3.csv")
df = df.drop(columns=['Unnamed: 0'])
df.shape

(8506, 16)

In [5]:
df.head()

Unnamed: 0,allow_forking,commit_count,contributor_count,created_at,forks,github_pages,license,name,open_issues,size,star_count,topics_count,total_issue_count,updated_at,url,watchers_count
0,1.0,5524,312.0,2020-05-19T02:37:13Z,882.0,0.0,MIT License,ManimCommunity/manim,351.0,33506.0,9951.0,4.0,1030.0,2022-05-09T17:53:54Z,https://github.com/ManimCommunity/manim,9951.0
1,1.0,224,16.0,2017-08-25T12:05:15Z,2590.0,0.0,MIT License,zalandoresearch/fashion-mnist,24.0,108395.0,9998.0,11.0,98.0,2022-05-09T09:30:18Z,https://github.com/zalandoresearch/fashion-mnist,9998.0
2,1.0,133,3.0,2018-01-09T09:48:49Z,2181.0,0.0,Apache License 2.0,Embedding/Chinese-Word-Vectors,42.0,1477.0,9976.0,6.0,147.0,2022-05-09T14:13:08Z,https://github.com/Embedding/Chinese-Word-Vectors,9976.0
3,1.0,6023,18.0,2018-07-11T18:28:58Z,1494.0,1.0,GNU Lesser General Public License v3.0,PySimpleGUI/PySimpleGUI,659.0,261676.0,9924.0,20.0,2750.0,2022-05-09T15:50:27Z,https://github.com/PySimpleGUI/PySimpleGUI,9924.0
4,1.0,541,9.0,2016-05-29T13:29:44Z,5612.0,0.0,MIT License,MorvanZhou/tutorials,13.0,62652.0,9944.0,9.0,66.0,2022-05-09T11:56:21Z,https://github.com/MorvanZhou/tutorials,9944.0


In [6]:
df.dtypes

allow_forking        float64
commit_count           int64
contributor_count    float64
created_at            object
forks                float64
github_pages         float64
license               object
name                  object
open_issues          float64
size                 float64
star_count           float64
topics_count         float64
total_issue_count    float64
updated_at            object
url                   object
watchers_count       float64
dtype: object

## Filling Null Values

In [7]:
df.isnull().sum()

allow_forking          0
commit_count           0
contributor_count    108
created_at             0
forks                  0
github_pages           0
license                0
name                   0
open_issues            0
size                   0
star_count             0
topics_count           0
total_issue_count      0
updated_at             0
url                    0
watchers_count         0
dtype: int64

In [8]:
for i in range(df.shape[0]):
    if df.loc[i, "total_issue_count"] == 0:
        df.loc[i, "total_issue_count"] = df.loc[i, "open_issues"]
    elif df.loc[i, "total_issue_count"] < df.loc[i, "open_issues"]:
        df.loc[i, "total_issue_count"] = df.loc[i, "open_issues"]

In [9]:
df["commit_count"] = df["commit_count"].replace(0, df["commit_count"].mean())
df["contributor_count"] = df["contributor_count"].fillna(df["contributor_count"].mean())

In [10]:
df[df["contributor_count"] == 0]

Unnamed: 0,allow_forking,commit_count,contributor_count,created_at,forks,github_pages,license,name,open_issues,size,star_count,topics_count,total_issue_count,updated_at,url,watchers_count


In [11]:
df.isnull().sum()

allow_forking        0
commit_count         0
contributor_count    0
created_at           0
forks                0
github_pages         0
license              0
name                 0
open_issues          0
size                 0
star_count           0
topics_count         0
total_issue_count    0
updated_at           0
url                  0
watchers_count       0
dtype: int64

In [12]:
df.describe()

Unnamed: 0,allow_forking,commit_count,contributor_count,forks,github_pages,open_issues,size,star_count,topics_count,total_issue_count,watchers_count
count,8506.0,8506.0,8506.0,8506.0,8506.0,8506.0,8506.0,8506.0,8506.0,8506.0,8506.0
mean,1.0,2933.376,90.902515,871.199036,0.214907,132.042558,81569.63,4617.587233,5.197625,650.399365,4617.587233
std,0.0,24305.93,507.283919,1126.250497,0.410782,318.062934,849439.1,2703.210825,5.349499,1770.912563,2703.210825
min,1.0,1.0,2.0,3.0,0.0,0.0,0.0,914.0,0.0,0.0,914.0
25%,1.0,143.0,12.0,303.0,0.0,15.0,1195.25,1982.0,0.0,64.0,1982.0
50%,1.0,481.5,36.0,571.0,0.0,47.0,6698.0,3996.0,4.0,210.0,3996.0
75%,1.0,1583.5,90.902515,1090.0,0.0,131.0,35584.0,6946.0,8.0,610.0,6946.0
max,1.0,1052496.0,42941.0,33854.0,1.0,8591.0,57016100.0,10280.0,20.0,63929.0,10280.0


## Feature Engineering

### Add how many years the project has been developed

In [15]:
df["created_at"] = pd.to_datetime(df["created_at"])
df["updated_at"] = pd.to_datetime(df["updated_at"])
df["years"] = (df["updated_at"] - df["created_at"]) / np.timedelta64(1, 'Y')
df["years"] = df["years"].astype(np.int32)

### Find the closed issue count, add closed issue rate (closed / all)

In [16]:
df["closed_issue"] = df["total_issue_count"] - df["open_issues"]
df["issue_close_rate"] = df["closed_issue"] / df["total_issue_count"]
df["issue_close_rate"] = df["issue_close_rate"].fillna(0)
df[["closed_issue", "issue_close_rate", "total_issue_count"]]

Unnamed: 0,closed_issue,issue_close_rate,total_issue_count
0,679.0,0.659223,1030.0
1,74.0,0.755102,98.0
2,105.0,0.714286,147.0
3,2091.0,0.760364,2750.0
4,53.0,0.803030,66.0
...,...,...,...
8501,370.0,0.943878,392.0
8502,0.0,0.000000,204.0
8503,1317.0,0.961314,1370.0
8504,8.0,0.888889,9.0


In [17]:
df.describe()

Unnamed: 0,allow_forking,commit_count,contributor_count,forks,github_pages,open_issues,size,star_count,topics_count,total_issue_count,watchers_count,years,closed_issue,issue_close_rate
count,8506.0,8506.0,8506.0,8506.0,8506.0,8506.0,8506.0,8506.0,8506.0,8506.0,8506.0,8506.0,8506.0,8506.0
mean,1.0,2933.376,90.902515,871.199036,0.214907,132.042558,81569.63,4617.587233,5.197625,650.399365,4617.587233,5.717729,518.356807,0.661832
std,0.0,24305.93,507.283919,1126.250497,0.410782,318.062934,849439.1,2703.210825,5.349499,1770.912563,2703.210825,2.959044,1541.221495,0.271235
min,1.0,1.0,2.0,3.0,0.0,0.0,0.0,914.0,0.0,0.0,914.0,0.0,0.0,0.0
25%,1.0,143.0,12.0,303.0,0.0,15.0,1195.25,1982.0,0.0,64.0,1982.0,3.0,32.0,0.496309
50%,1.0,481.5,36.0,571.0,0.0,47.0,6698.0,3996.0,4.0,210.0,3996.0,5.0,132.0,0.727273
75%,1.0,1583.5,90.902515,1090.0,0.0,131.0,35584.0,6946.0,8.0,610.0,6946.0,8.0,457.0,0.881055
max,1.0,1052496.0,42941.0,33854.0,1.0,8591.0,57016100.0,10280.0,20.0,63929.0,10280.0,14.0,58901.0,1.0


### Mapping Licence to Commercial Use

In [18]:
df["license"].value_counts()

MIT License                                                   3101
Apache License 2.0                                            1375
Other                                                         1338
Not Found                                                     1050
GNU General Public License v3.0                                556
BSD 3-Clause "New" or "Revised" License                        355
GNU General Public License v2.0                                148
GNU Affero General Public License v3.0                         127
BSD 2-Clause "Simplified" License                              120
Creative Commons Zero v1.0 Universal                            72
GNU Lesser General Public License v3.0                          52
Mozilla Public License 2.0                                      52
The Unlicense                                                   30
Creative Commons Attribution Share Alike 4.0 International      24
GNU Lesser General Public License v2.1                        

All of the licences are free for commercial usage, therefore this is not useful. Will drop `license` column

In [19]:
df.head()

Unnamed: 0,allow_forking,commit_count,contributor_count,created_at,forks,github_pages,license,name,open_issues,size,star_count,topics_count,total_issue_count,updated_at,url,watchers_count,years,closed_issue,issue_close_rate
0,1.0,5524.0,312.0,2020-05-19 02:37:13+00:00,882.0,0.0,MIT License,ManimCommunity/manim,351.0,33506.0,9951.0,4.0,1030.0,2022-05-09 17:53:54+00:00,https://github.com/ManimCommunity/manim,9951.0,1,679.0,0.659223
1,1.0,224.0,16.0,2017-08-25 12:05:15+00:00,2590.0,0.0,MIT License,zalandoresearch/fashion-mnist,24.0,108395.0,9998.0,11.0,98.0,2022-05-09 09:30:18+00:00,https://github.com/zalandoresearch/fashion-mnist,9998.0,4,74.0,0.755102
2,1.0,133.0,3.0,2018-01-09 09:48:49+00:00,2181.0,0.0,Apache License 2.0,Embedding/Chinese-Word-Vectors,42.0,1477.0,9976.0,6.0,147.0,2022-05-09 14:13:08+00:00,https://github.com/Embedding/Chinese-Word-Vectors,9976.0,4,105.0,0.714286
3,1.0,6023.0,18.0,2018-07-11 18:28:58+00:00,1494.0,1.0,GNU Lesser General Public License v3.0,PySimpleGUI/PySimpleGUI,659.0,261676.0,9924.0,20.0,2750.0,2022-05-09 15:50:27+00:00,https://github.com/PySimpleGUI/PySimpleGUI,9924.0,3,2091.0,0.760364
4,1.0,541.0,9.0,2016-05-29 13:29:44+00:00,5612.0,0.0,MIT License,MorvanZhou/tutorials,13.0,62652.0,9944.0,9.0,66.0,2022-05-09 11:56:21+00:00,https://github.com/MorvanZhou/tutorials,9944.0,5,53.0,0.80303


`Watcher Count` is directly related with the star count, therefore we drop that column

## Categorize the star counts

In [20]:
df["star_category"] = df["star_count"].map(lambda x: (x // 1000) + 1)

In [21]:
df["star_category"].value_counts()

2.0     1765
4.0     1192
3.0      909
8.0      852
5.0      827
7.0      698
6.0      670
9.0      615
10.0     553
1.0      408
11.0      17
Name: star_category, dtype: int64

In [22]:
df = df[df["star_category"] < 11] # we want star counts between 0- 10000. drop anything above 100000

In [23]:
df["star_category"].value_counts()

2.0     1765
4.0     1192
3.0      909
8.0      852
5.0      827
7.0      698
6.0      670
9.0      615
10.0     553
1.0      408
Name: star_category, dtype: int64

In [25]:
df2 = df.groupby('star_category').apply(lambda x: x.sample(408))

In [113]:
df2.corr()

Unnamed: 0,allow_forking,commit_count,contributor_count,forks,github_pages,open_issues,size,star_count,topics_count,total_issue_count,watchers_count,years,closed_issue,issue_close_rate,star_category
allow_forking,,,,,,,,,,,,,,,
commit_count,,1.0,0.42312,0.224568,-0.02712,0.289193,0.297728,0.070852,-0.015727,0.375701,0.070852,0.083322,0.374153,0.044375,0.071221
contributor_count,,0.42312,1.0,0.2869,0.002663,0.471497,0.159023,0.178423,0.011627,0.417167,0.178423,0.202527,0.384761,0.165504,0.175617
forks,,0.224568,0.2869,1.0,0.03811,0.24515,0.146593,0.402712,-0.000658,0.28942,0.402712,0.143346,0.283663,-0.015202,0.401257
github_pages,,-0.02712,0.002663,0.03811,1.0,-0.012393,-0.009838,0.134688,0.018416,-0.007828,0.134688,0.06833,-0.006497,0.060094,0.132316
open_issues,,0.289193,0.471497,0.24515,-0.012393,1.0,0.144496,0.17056,0.025231,0.788317,0.17056,0.111258,0.704833,-0.026553,0.170324
size,,0.297728,0.159023,0.146593,-0.009838,0.144496,1.0,0.032843,0.040689,0.255623,0.032843,0.000566,0.265235,0.019975,0.034169
star_count,,0.070852,0.178423,0.402712,0.134688,0.17056,0.032843,1.0,0.087156,0.171657,1.0,0.156306,0.163109,0.106613,0.994323
topics_count,,-0.015727,0.011627,-0.000658,0.018416,0.025231,0.040689,0.087156,1.0,0.055545,0.087156,-0.210284,0.058892,0.196004,0.088237
total_issue_count,,0.375701,0.417167,0.28942,-0.007828,0.788317,0.255623,0.171657,0.055545,1.0,0.171657,0.136209,0.992088,0.17114,0.172322


In [26]:
df2["star_category"].value_counts()

1.0     408
2.0     408
3.0     408
4.0     408
5.0     408
6.0     408
7.0     408
8.0     408
9.0     408
10.0    408
Name: star_category, dtype: int64

In [114]:
X = df2.drop(columns=["created_at", "license", "name", "updated_at", "url", "watchers_count", "star_count", "star_category"])
Y = df2["star_category"].values

In [115]:
X.shape, Y.shape

((4080, 13), (4080,))

In [116]:
X.corr()

Unnamed: 0,allow_forking,commit_count,contributor_count,forks,github_pages,open_issues,size,topics_count,total_issue_count,watchers_count,years,closed_issue,issue_close_rate
allow_forking,,,,,,,,,,,,,
commit_count,,1.0,0.42312,0.224568,-0.02712,0.289193,0.297728,-0.015727,0.375701,0.070852,0.083322,0.374153,0.044375
contributor_count,,0.42312,1.0,0.2869,0.002663,0.471497,0.159023,0.011627,0.417167,0.178423,0.202527,0.384761,0.165504
forks,,0.224568,0.2869,1.0,0.03811,0.24515,0.146593,-0.000658,0.28942,0.402712,0.143346,0.283663,-0.015202
github_pages,,-0.02712,0.002663,0.03811,1.0,-0.012393,-0.009838,0.018416,-0.007828,0.134688,0.06833,-0.006497,0.060094
open_issues,,0.289193,0.471497,0.24515,-0.012393,1.0,0.144496,0.025231,0.788317,0.17056,0.111258,0.704833,-0.026553
size,,0.297728,0.159023,0.146593,-0.009838,0.144496,1.0,0.040689,0.255623,0.032843,0.000566,0.265235,0.019975
topics_count,,-0.015727,0.011627,-0.000658,0.018416,0.025231,0.040689,1.0,0.055545,0.087156,-0.210284,0.058892,0.196004
total_issue_count,,0.375701,0.417167,0.28942,-0.007828,0.788317,0.255623,0.055545,1.0,0.171657,0.136209,0.992088,0.17114
watchers_count,,0.070852,0.178423,0.402712,0.134688,0.17056,0.032843,0.087156,0.171657,1.0,0.156306,0.163109,0.106613


## Preparing Train and Test Data
We will split train to test as 80%, 20&

In [117]:
from sklearn.model_selection import train_test_split

In [118]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=50, test_size=0.2)

In [119]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((3264, 13), (3264,), (816, 13), (816,))

## Apply Baseline Machine Learning Model

In [120]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [121]:
for i in range(1, 21):
    neigh = KNeighborsClassifier(n_neighbors=i)
    neigh.fit(X_train, y_train)
    preds = neigh.predict(X_test)
    print(f"for neighbour: {i} : accuracy: {accuracy_score(y_test, preds)}")

for neighbour: 1 : accuracy: 0.6703431372549019
for neighbour: 2 : accuracy: 0.6213235294117647
for neighbour: 3 : accuracy: 0.6017156862745098
for neighbour: 4 : accuracy: 0.5992647058823529
for neighbour: 5 : accuracy: 0.5919117647058824
for neighbour: 6 : accuracy: 0.5943627450980392
for neighbour: 7 : accuracy: 0.5931372549019608
for neighbour: 8 : accuracy: 0.5894607843137255
for neighbour: 9 : accuracy: 0.5919117647058824
for neighbour: 10 : accuracy: 0.5882352941176471
for neighbour: 11 : accuracy: 0.5747549019607843
for neighbour: 12 : accuracy: 0.571078431372549
for neighbour: 13 : accuracy: 0.5821078431372549
for neighbour: 14 : accuracy: 0.5808823529411765
for neighbour: 15 : accuracy: 0.5857843137254902
for neighbour: 16 : accuracy: 0.5674019607843137
for neighbour: 17 : accuracy: 0.5821078431372549
for neighbour: 18 : accuracy: 0.5759803921568627
for neighbour: 19 : accuracy: 0.5759803921568627
for neighbour: 20 : accuracy: 0.5821078431372549


In [122]:
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn import tree

models = {
    "SVC": svm.SVC(),
    "Naive Bayes": GaussianNB(),
    "Decision Tree": tree.DecisionTreeClassifier(),
}

for name, model in models.items():

    model.fit(X_train, y_train)

    preds = model.predict(X_test)

    print(f"Accuracy score for {name} : {accuracy_score(y_test, preds)}")

Accuracy score for SVC : 0.09558823529411764
Accuracy score for Naive Bayes : 0.7904411764705882
Accuracy score for Decision Tree : 1.0


In [123]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, preds)

array([[ 73,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       [  0,  86,   0,   0,   0,   0,   0,   0,   0,   0],
       [  0,   0,  75,   0,   0,   0,   0,   0,   0,   0],
       [  0,   0,   0,  89,   0,   0,   0,   0,   0,   0],
       [  0,   0,   0,   0,  79,   0,   0,   0,   0,   0],
       [  0,   0,   0,   0,   0,  71,   0,   0,   0,   0],
       [  0,   0,   0,   0,   0,   0,  72,   0,   0,   0],
       [  0,   0,   0,   0,   0,   0,   0,  86,   0,   0],
       [  0,   0,   0,   0,   0,   0,   0,   0, 101,   0],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,  84]])

In [135]:
import github3
import os
from tqdm import tqdm

In [128]:
token = os.getenv("STAR_GITHUB_TOKEN")

In [138]:
gh = github3.login(token=token)

In [139]:
df.head()

Unnamed: 0,allow_forking,commit_count,contributor_count,created_at,forks,github_pages,license,name,open_issues,size,star_count,topics_count,total_issue_count,updated_at,url,watchers_count,years,closed_issue,issue_close_rate,star_category
0,1.0,5524.0,312.0,2020-05-19 02:37:13+00:00,882.0,0.0,MIT License,ManimCommunity/manim,351.0,33506.0,9951.0,4.0,1030.0,2022-05-09 17:53:54+00:00,https://github.com/ManimCommunity/manim,9951.0,1,679.0,0.659223,10.0
1,1.0,224.0,16.0,2017-08-25 12:05:15+00:00,2590.0,0.0,MIT License,zalandoresearch/fashion-mnist,24.0,108395.0,9998.0,11.0,98.0,2022-05-09 09:30:18+00:00,https://github.com/zalandoresearch/fashion-mnist,9998.0,4,74.0,0.755102,10.0
2,1.0,133.0,3.0,2018-01-09 09:48:49+00:00,2181.0,0.0,Apache License 2.0,Embedding/Chinese-Word-Vectors,42.0,1477.0,9976.0,6.0,147.0,2022-05-09 14:13:08+00:00,https://github.com/Embedding/Chinese-Word-Vectors,9976.0,4,105.0,0.714286,10.0
3,1.0,6023.0,18.0,2018-07-11 18:28:58+00:00,1494.0,1.0,GNU Lesser General Public License v3.0,PySimpleGUI/PySimpleGUI,659.0,261676.0,9924.0,20.0,2750.0,2022-05-09 15:50:27+00:00,https://github.com/PySimpleGUI/PySimpleGUI,9924.0,3,2091.0,0.760364,10.0
4,1.0,541.0,9.0,2016-05-29 13:29:44+00:00,5612.0,0.0,MIT License,MorvanZhou/tutorials,13.0,62652.0,9944.0,9.0,66.0,2022-05-09 11:56:21+00:00,https://github.com/MorvanZhou/tutorials,9944.0,5,53.0,0.80303,10.0


In [141]:
pr_counts = []
for n in tqdm(df["name"].values):
    [owner, repo_name] = n.split("/")
    cur_pr_count = len(list(gh.repository(owner, repo_name).pull_requests()))
    pr_counts.append(cur_pr_count)

  0%|▎                                                                                                                  | 20/8489 [01:03<7:29:47,  3.19s/it]


KeyboardInterrupt: 