# Reading Data

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
def create_dataframe(data_folder=None, n_files=41):
    """Load the repository CSV shards into a single DataFrame.

    Reads ``python_100_10000_{i}.csv`` for every ``i`` in ``range(n_files)``
    from ``data_folder`` and stacks them row-wise under a fresh 0..n-1 index.

    Parameters
    ----------
    data_folder : str or path-like, optional
        Directory holding the CSV shards. Defaults to ``<cwd>/data``.
    n_files : int, optional
        Number of shards to read, starting at index 0. Defaults to 41.

    Returns
    -------
    pandas.DataFrame
        All shards concatenated, with the ``Unnamed: 0`` column removed.
        An empty frame is returned when ``n_files`` is 0.

    Raises
    ------
    FileNotFoundError
        If one of the expected shard files does not exist.
    KeyError
        If the concatenated data has no ``Unnamed: 0`` column.
    """
    if data_folder is None:
        data_folder = os.path.join(os.getcwd(), "data")

    # Read every shard first and concatenate once: calling pd.concat inside
    # the loop re-copies all previously loaded rows on each iteration.
    frames = [
        pd.read_csv(os.path.join(data_folder, f"python_100_10000_{i}.csv"))
        for i in range(n_files)
    ]
    if not frames:
        return pd.DataFrame()

    df = pd.concat(frames, ignore_index=True)
    # 'Unnamed: 0' is presumably the row index saved alongside the data when
    # the shards were written; it is not a feature.
    return df.drop(columns=['Unnamed: 0'])

In [3]:
# Load every CSV shard into one frame and preview the first rows.
df = create_dataframe()
df.head()

Unnamed: 0,allow_forking,commit_count,contributor_count,created_at,forks,github_pages,license,name,open_issues,size,star_count,topics_count,total_issue_count,updated_at,url,watchers_count
0,1.0,5524,312.0,2020-05-19T02:37:13Z,882.0,0.0,MIT License,ManimCommunity/manim,351.0,33506.0,9951.0,4.0,1030.0,2022-05-09T17:53:54Z,https://github.com/ManimCommunity/manim,9951.0
1,1.0,224,16.0,2017-08-25T12:05:15Z,2590.0,0.0,MIT License,zalandoresearch/fashion-mnist,24.0,108395.0,9998.0,11.0,98.0,2022-05-09T09:30:18Z,https://github.com/zalandoresearch/fashion-mnist,9998.0
2,1.0,133,3.0,2018-01-09T09:48:49Z,2181.0,0.0,Apache License 2.0,Embedding/Chinese-Word-Vectors,42.0,1477.0,9976.0,6.0,147.0,2022-05-09T14:13:08Z,https://github.com/Embedding/Chinese-Word-Vectors,9976.0
3,1.0,6023,18.0,2018-07-11T18:28:58Z,1494.0,1.0,GNU Lesser General Public License v3.0,PySimpleGUI/PySimpleGUI,659.0,261676.0,9924.0,20.0,2750.0,2022-05-09T15:50:27Z,https://github.com/PySimpleGUI/PySimpleGUI,9924.0
4,1.0,541,9.0,2016-05-29T13:29:44Z,5612.0,0.0,MIT License,MorvanZhou/tutorials,13.0,62652.0,9944.0,9.0,66.0,2022-05-09T11:56:21Z,https://github.com/MorvanZhou/tutorials,9944.0


In [4]:
# Inspect the dtype pandas inferred for each column.
df.dtypes

allow_forking        float64
commit_count           int64
contributor_count    float64
created_at            object
forks                float64
github_pages         float64
license               object
name                  object
open_issues          float64
size                 float64
star_count           float64
topics_count         float64
total_issue_count    float64
updated_at            object
url                   object
watchers_count       float64
dtype: object

## Filling Null Values

In [5]:
# Count missing values per column to see which ones need filling.
df.isnull().sum()

allow_forking          0
commit_count           0
contributor_count    155
created_at             0
forks                  0
github_pages           0
license                0
name                   0
open_issues            0
size                   0
star_count             0
topics_count           0
total_issue_count      0
updated_at             0
url                    0
watchers_count         0
dtype: int64

In [6]:
df["contributor_count"] = df["contributor_count"].fillna(df["contributor_count"].mean()) # fill missing contributor counts with the column mean

In [7]:
# Repair inconsistent issue totals: wherever the total is zero or smaller than
# the number of open issues, use the open-issue count as the total.
# A single boolean-mask assignment replaces the per-row Python loop; the result
# is the same, but it no longer depends on the index being exactly 0..n-1.
total_issues = df["total_issue_count"]
needs_fix = (total_issues == 0) | (total_issues < df["open_issues"])
df.loc[needs_fix, "total_issue_count"] = df.loc[needs_fix, "open_issues"]

In [8]:
# Replace commit counts of 0 with the column mean (zeros presumably mark
# repositories whose commit count could not be collected).
# NOTE(review): the mean is computed with those zero rows still included — confirm this is intended.
df["commit_count"] = df["commit_count"].replace(0, df["commit_count"].mean())

In [9]:
# Re-check the per-column missing-value counts after filling.
df.isnull().sum()

allow_forking        0
commit_count         0
contributor_count    0
created_at           0
forks                0
github_pages         0
license              0
name                 0
open_issues          0
size                 0
star_count           0
topics_count         0
total_issue_count    0
updated_at           0
url                  0
watchers_count       0
dtype: int64

In [10]:
# Summary statistics of the numeric columns after cleaning.
df.describe()

Unnamed: 0,allow_forking,commit_count,contributor_count,forks,github_pages,open_issues,size,star_count,topics_count,total_issue_count,watchers_count
count,4100.0,4100.0,4100.0,4100.0,4100.0,4100.0,4100.0,4100.0,4100.0,4100.0,4100.0
mean,1.0,1584.557561,60.614956,566.492683,0.161463,90.137805,67469.21,2733.256829,5.11878,296.208537,2733.256829
std,0.0,6634.910726,171.497638,612.552046,0.368003,200.877626,1046066.0,2078.212911,5.462667,743.729756,2078.212911
min,1.0,1.0,2.0,3.0,0.0,0.0,0.0,914.0,0.0,0.0,914.0
25%,1.0,92.0,7.0,215.75,0.0,12.0,820.0,1325.0,0.0,29.0,1325.0
50%,1.0,328.5,22.0,381.0,0.0,36.0,4793.0,1946.5,4.0,91.0,1946.5
75%,1.0,1086.0,60.614956,667.0,0.0,93.0,27844.25,2994.0,8.0,254.0,2994.0
max,1.0,232549.0,5602.0,6268.0,1.0,4427.0,57016100.0,9998.0,20.0,18576.0,9998.0


## Feature Engineering

### Add how many years the project has been developed

In [11]:
# Parse the timestamp strings into datetimes so they can be subtracted.
df["created_at"] = pd.to_datetime(df["created_at"])
df["updated_at"] = pd.to_datetime(df["updated_at"])

# Project age in whole years (fraction truncated by the int cast).
# Divide by an explicit average-year Timedelta: np.timedelta64(1, 'Y') is a
# calendar unit without a fixed length, and recent pandas versions refuse to
# use it as a divisor. 365.2425 days is the year length older pandas used for 'Y'.
DAYS_PER_YEAR = 365.2425
project_age = df["updated_at"] - df["created_at"]
df["years"] = (project_age / pd.Timedelta(days=DAYS_PER_YEAR)).astype(np.int32)

### Find the closed issue count, add closed issue rate (closed / all)

In [36]:
# Closed issues are whatever is left of the total once open issues are removed.
closed_issues = df["total_issue_count"].sub(df["open_issues"])
df["closed_issue"] = closed_issues

# Share of all issues that were closed; repositories with no issues at all
# produce 0/0 = NaN, which is mapped to a rate of 0.
close_rate = closed_issues.div(df["total_issue_count"])
df["issue_close_rate"] = close_rate.fillna(0)

df[["closed_issue", "issue_close_rate", "total_issue_count"]]

Unnamed: 0,closed_issue,issue_close_rate,total_issue_count
0,679.0,0.659223,1030.0
1,74.0,0.755102,98.0
2,105.0,0.714286,147.0
3,2091.0,0.760364,2750.0
4,53.0,0.803030,66.0
...,...,...,...
4095,0.0,0.000000,13.0
4096,0.0,0.000000,7.0
4097,0.0,0.000000,6.0
4098,0.0,0.000000,327.0


In [37]:
# Summary statistics again, now including the engineered columns.
df.describe()

Unnamed: 0,allow_forking,commit_count,contributor_count,forks,github_pages,open_issues,size,star_count,topics_count,total_issue_count,watchers_count,years,closed_issue,issue_close_rate,star_category
count,4100.0,4100.0,4100.0,4100.0,4100.0,4100.0,4100.0,4100.0,4100.0,4100.0,4100.0,4100.0,4100.0,4100.0,4100.0
mean,1.0,1584.557561,60.614956,566.492683,0.161463,90.137805,67469.21,2733.256829,5.11878,296.208537,2733.256829,5.21561,206.070732,0.43845,3.179268
std,0.0,6634.910726,171.497638,612.552046,0.368003,200.877626,1046066.0,2078.212911,5.462667,743.729756,2078.212911,2.954043,612.940528,0.366835,2.098411
min,1.0,1.0,2.0,3.0,0.0,0.0,0.0,914.0,0.0,0.0,914.0,0.0,0.0,0.0,1.0
25%,1.0,92.0,7.0,215.75,0.0,12.0,820.0,1325.0,0.0,29.0,1325.0,3.0,0.0,0.0,2.0
50%,1.0,328.5,22.0,381.0,0.0,36.0,4793.0,1946.5,4.0,91.0,1946.5,5.0,29.0,0.486486,2.0
75%,1.0,1086.0,60.614956,667.0,0.0,93.0,27844.25,2994.0,8.0,254.0,2994.0,7.0,155.0,0.781219,3.0
max,1.0,232549.0,5602.0,6268.0,1.0,4427.0,57016100.0,9998.0,20.0,18576.0,9998.0,13.0,16633.0,1.0,10.0


### Mapping Licence to Commercial Use

In [38]:
# How many repositories use each license.
df["license"].value_counts()

MIT License                                                   1403
Apache License 2.0                                             666
Other                                                          637
Not Found                                                      472
GNU General Public License v3.0                                335
BSD 3-Clause "New" or "Revised" License                        229
GNU General Public License v2.0                                 82
BSD 2-Clause "Simplified" License                               80
GNU Affero General Public License v3.0                          69
GNU Lesser General Public License v3.0                          34
Mozilla Public License 2.0                                      20
Creative Commons Zero v1.0 Universal                            19
The Unlicense                                                   15
Creative Commons Attribution Share Alike 4.0 International       8
GNU Lesser General Public License v2.1                        

All of the licenses allow commercial use, so this column is not useful for our purpose. We will drop the `license` column.

In [39]:
# Preview the frame with the columns added so far.
df.head()

Unnamed: 0,allow_forking,commit_count,contributor_count,created_at,forks,github_pages,license,name,open_issues,size,star_count,topics_count,total_issue_count,updated_at,url,watchers_count,years,closed_issue,issue_close_rate,star_category
0,1.0,5524,312.0,2020-05-19 02:37:13+00:00,882.0,0.0,MIT License,ManimCommunity/manim,351.0,33506.0,9951.0,4.0,1030.0,2022-05-09 17:53:54+00:00,https://github.com/ManimCommunity/manim,9951.0,1,679.0,0.659223,10.0
1,1.0,224,16.0,2017-08-25 12:05:15+00:00,2590.0,0.0,MIT License,zalandoresearch/fashion-mnist,24.0,108395.0,9998.0,11.0,98.0,2022-05-09 09:30:18+00:00,https://github.com/zalandoresearch/fashion-mnist,9998.0,4,74.0,0.755102,10.0
2,1.0,133,3.0,2018-01-09 09:48:49+00:00,2181.0,0.0,Apache License 2.0,Embedding/Chinese-Word-Vectors,42.0,1477.0,9976.0,6.0,147.0,2022-05-09 14:13:08+00:00,https://github.com/Embedding/Chinese-Word-Vectors,9976.0,4,105.0,0.714286,10.0
3,1.0,6023,18.0,2018-07-11 18:28:58+00:00,1494.0,1.0,GNU Lesser General Public License v3.0,PySimpleGUI/PySimpleGUI,659.0,261676.0,9924.0,20.0,2750.0,2022-05-09 15:50:27+00:00,https://github.com/PySimpleGUI/PySimpleGUI,9924.0,3,2091.0,0.760364,10.0
4,1.0,541,9.0,2016-05-29 13:29:44+00:00,5612.0,0.0,MIT License,MorvanZhou/tutorials,13.0,62652.0,9944.0,9.0,66.0,2022-05-09 11:56:21+00:00,https://github.com/MorvanZhou/tutorials,9944.0,5,53.0,0.80303,10.0


`watchers_count` is directly related to the star count, so we drop that column.

## Categorize the star counts

In [40]:
# Bucket repositories by thousands of stars: 0-999 -> 1, 1000-1999 -> 2, ...
df["star_category"] = df["star_count"].floordiv(1000).add(1)

In [41]:
# Class balance of the target: number of repositories per star bucket.
df["star_category"].value_counts()

2.0     1765
3.0      909
1.0      408
4.0      241
5.0      224
6.0      159
9.0      127
7.0      111
10.0      82
8.0       74
Name: star_category, dtype: int64

In [42]:
# Columns excluded from the feature matrix: raw timestamps, text/identifier
# fields, the watcher count, and the star columns the target is built from.
non_feature_columns = [
    "created_at",
    "license",
    "name",
    "updated_at",
    "url",
    "watchers_count",
    "star_count",
    "star_category",
]
X = df.drop(columns=non_feature_columns)

# Target: the star bucket of each repository, as a NumPy array.
Y = df["star_category"].to_numpy()

In [43]:
# Sanity check: features and target must have the same number of rows.
X.shape, Y.shape

((4100, 12), (4100,))

## Preparing Train and Test Data
We will split the data into 80% train and 20% test.

In [44]:
from sklearn.model_selection import train_test_split

In [49]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=42, test_size=0.2)

In [50]:
# Confirm the sizes of the train and test partitions.
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((3280, 12), (3280,), (820, 12), (820,))

## Apply Baseline Machine Learning Model

In [51]:
from sklearn.neighbors import KNeighborsClassifier

In [59]:
# Baseline: k-nearest-neighbours classifier voting over the 100 closest training rows.
# NOTE(review): the features are passed in unscaled, so columns with large
# magnitudes (e.g. `size`) likely dominate the distance — consider standardizing.
neigh = KNeighborsClassifier(n_neighbors=100)
neigh.fit(X_train, y_train)
# Predict the star bucket for every held-out row.
preds = neigh.predict(X_test)

In [60]:
from sklearn.metrics import accuracy_score
# Fraction of test rows whose predicted star bucket matches the true one.
print(accuracy_score(y_test, preds))

0.41829268292682925
