In [37]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import csv
from datetime import datetime
import pytz
import re

In [None]:
with open("by_repo_results (3).csv", "r", encoding='utf-8') as f:
  csvreader = csv.reader(f, delimiter=";")
  headers = next(csvreader)
  data = []
  for row in csvreader:
    if row[0] != " ":
      company_name = row[0]
    else:
      # if company_name in allowed_orgs:
        # print(row)
        row[0] = company_name
        data.append(row)

repo_data = pd.DataFrame(data=data, columns=headers)
repo_data["repo_stars"] = repo_data["repo_stars"].astype(np.int32)

In [40]:
repo_data['repo_stars'] = pd.to_numeric(repo_data['repo_stars'], errors='coerce').fillna(0)
repo_data['repo_half_year_commits'] = pd.to_numeric(repo_data['repo_half_year_commits'], errors='coerce')

#создание признака days_since_last_commit
repo_data['repo_last_commit_date'] = pd.to_datetime(repo_data['repo_last_commit_date'], format='%Y-%m-%dT%H:%M:%SZ', errors='coerce')

current_date = datetime.now(pytz.utc)
def calculate_days_since_last_commit(date):
    if pd.isna(date): 
        return None  
    return (current_date - date).days

repo_data['days_since_last_commit'] = repo_data['repo_last_commit_date'].apply(calculate_days_since_last_commit)

repo_data['days_since_last_commit'].head(10)

0     346.0
1     489.0
2     767.0
3    1193.0
4     417.0
5     992.0
6    1005.0
7     816.0
8     991.0
9     795.0
Name: days_since_last_commit, dtype: float64

In [None]:
#преобразование категориального признака в числовой

def extract_main_language(language_str):
    if language_str and isinstance(language_str, str):
        match = re.match(r"\('?(.*?)[:s]", language_str)
        if match:
            return match.group(1)
    return "Unknown"

repo_data["leading_language"] = repo_data["repo_languages"].apply(lambda x: extract_main_language(x))

label_encoder = LabelEncoder()

repo_data['leading_language_encoded'] = label_encoder.fit_transform(repo_data['leading_language'])

repo_data['leading_language_encoded'].head(10)

0    85
1    11
2    11
3    11
4    10
5    65
6    85
7    10
8     8
9    11
Name: leading_language_encoded, dtype: int64

In [68]:
#создание целевого признака

stars_threshold = repo_data['repo_stars'].quantile(0.85)
print(stars_threshold)

repo_data['is_top_project'] = (repo_data['repo_stars'] >= stars_threshold).astype(int)
repo_data['is_top_project'].value_counts()

repo_data[repo_data['repo_stars'] != 0]

10.0


Unnamed: 0,organization,description,twitter_link,repo_name,repo_stars,repo_topics,repo_languages,repo_half_year_commits,repo_last_commit_date,days_since_last_commit,is_top_project,leading_language,leading_language_encoded,feature
0,ros-acceleration,,,community,60,"['cpu', 'fpga', 'gpu', 'hardware', 'hardware-a...","('%',)",-0.211798,2024-01-04 09:54:23+00:00,-1.023531,1,Unknown,1.673542,1.673542
1,ros-acceleration,,,ament_vitis,7,[],"('CMake: 100.0%',)",-0.211798,2023-08-14 09:36:49+00:00,-0.891849,0,CMake,-0.859083,-0.859083
2,ros-acceleration,,,acceleration_firmware_kv260,10,"['acceleration', 'fpga', 'hardware', 'hardware...","('CMake: 100.0%',)",-0.211798,2022-11-09 11:32:38+00:00,-0.635853,1,CMake,-0.859083,-0.859083
4,ros-acceleration,,,acceleration_examples,41,"['fpga', 'gpu', 'hardware-acceleration', 'ros2']","('C++: 48.70051274748906%, Python: 40.05005593...",-0.211798,2023-10-25 12:24:05+00:00,-0.958151,1,C++,-0.893308,-0.893308
10,ros-acceleration,,,acceleration_firmware_ultra96v2,5,"['acceleration', 'fpga', 'hardware', 'hardware...","('%',)",-0.211798,2022-06-20 23:20:33+00:00,-0.506014,0,Unknown,1.673542,1.673542
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8507,4am-robotics,,,cob_driver,102,[],"('C++: 84.66987225943875%, Python: 8.680938486...",0.088712,2024-08-05 07:47:01+00:00,-1.220593,1,C++,-0.893308,-0.893308
8508,4am-robotics,,,cob_command_tools,37,[],"('Python: 85.61973263981457%, C++: 12.40798272...",0.238967,2024-10-02 16:10:28+00:00,-1.274002,1,Python,0.989048,0.989048
8512,4am-robotics,,,cob_control,35,[],"('C++: 76.43477604048644%, Python: 19.96449882...",0.088712,2024-08-05 07:43:34+00:00,-1.220593,1,C++,-0.893308,-0.893308
8514,4am-robotics,,,pointcloud_to_laserscan,3,[],"('C++: 96.39305870157791%, CMake: 3.6069412984...",0.088712,2024-08-05 08:01:31+00:00,-1.220593,0,C++,-0.893308,-0.893308


In [None]:
#выбор признаков для классификации
features = [
    'repo_half_year_commits',
    'days_since_last_commit',
    'leading_language_encoded'
]

#удаление строк с пропусками
repo_data = repo_data.dropna(subset=features)

#нормализация числовых признаков
for feature in features:
    repo_data.loc[:, 'feature'] = (repo_data[feature] - repo_data[feature].mean()) / repo_data[feature].std()

In [None]:
X = repo_data[features]
y = repo_data['is_top_project']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)


In [None]:
#оценка модели
y_pred = clf.predict(X_test)
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.90      0.88      2163
           1       0.31      0.26      0.28       387

    accuracy                           0.80      2550
   macro avg       0.59      0.58      0.58      2550
weighted avg       0.79      0.80      0.79      2550

Confusion Matrix:
 [[1944  219]
 [ 288   99]]


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix

models = {
    "Logistic Regression": LogisticRegression(random_state=42),
    "Support Vector Machine": SVC(random_state=42, probability=True),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=5)
}

#обучение и оценка
for model_name, model in models.items():
    print(f"\n--- {model_name} ---")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))


--- Logistic Regression ---
              precision    recall  f1-score   support

           0       0.85      0.99      0.92      2380
           1       0.49      0.04      0.07       425

    accuracy                           0.85      2805
   macro avg       0.67      0.52      0.50      2805
weighted avg       0.80      0.85      0.79      2805

Confusion Matrix:
[[2362   18]
 [ 408   17]]

--- Support Vector Machine ---
              precision    recall  f1-score   support

           0       0.85      0.99      0.92      2380
           1       0.56      0.04      0.08       425

    accuracy                           0.85      2805
   macro avg       0.71      0.52      0.50      2805
weighted avg       0.81      0.85      0.79      2805

Confusion Matrix:
[[2365   15]
 [ 406   19]]

--- Random Forest ---
              precision    recall  f1-score   support

           0       0.87      0.90      0.89      2380
           1       0.32      0.28      0.30       425

    accu

In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix

#параметры для Random Forest
rf_param_grid = {
    'n_estimators': [100, 200, 300],        # Количество деревьев
    'max_depth': [None, 10, 20, 30],        # Максимальная глубина дерева
    'min_samples_split': [2, 5, 10],        # Минимальное количество образцов для разделения
    'min_samples_leaf': [1, 2, 4],          # Минимальное количество образцов в листе
    'class_weight': [None, 'balanced']      # Вес классов
}

#параметры для Gradient Boosting
gb_param_grid = {
    'n_estimators': [100, 200, 300],        # Количество деревьев
    'learning_rate': [0.01, 0.1, 0.2],      # Темп обучения
    'max_depth': [3, 5, 7],                 # Максимальная глубина дерева
    'min_samples_split': [2, 5, 10],        # Минимальное количество образцов для разделения
    'min_samples_leaf': [1, 2, 4]           # Минимальное количество образцов в листе
}

#Random Forest
rf_grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=rf_param_grid,
    scoring='f1_macro',  # Метрика оценки
    cv=3,                # Количество фолдов в кросс-валидации
    n_jobs=-1,           # Использование всех процессоров
    verbose=2            # Вывод процесса
)

#Gradient Boosting
gb_grid_search = GridSearchCV(
    estimator=GradientBoostingClassifier(random_state=42),
    param_grid=gb_param_grid,
    scoring='f1_macro',  # Метрика оценки
    cv=3,                # Количество фолдов в кросс-валидации
    n_jobs=-1,           # Использование всех процессоров
    verbose=2            # Вывод процесса
)

rf_grid_search.fit(X_train, y_train)
print("Лучшие параметры для Random Forest:", rf_grid_search.best_params_)

rf_best_model = rf_grid_search.best_estimator_
rf_y_pred = rf_best_model.predict(X_test)
print("--- Random Forest ---")
print(classification_report(y_test, rf_y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, rf_y_pred))

gb_grid_search.fit(X_train, y_train)
print("Лучшие параметры для Gradient Boosting:", gb_grid_search.best_params_)

gb_best_model = gb_grid_search.best_estimator_
gb_y_pred = gb_best_model.predict(X_test)
print("--- Gradient Boosting ---")
print(classification_report(y_test, gb_y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, gb_y_pred))


Fitting 3 folds for each of 216 candidates, totalling 648 fits
Лучшие параметры для Random Forest: {'class_weight': 'balanced', 'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 200}
--- Random Forest ---
              precision    recall  f1-score   support

           0       0.89      0.83      0.86      2380
           1       0.30      0.40      0.35       425

    accuracy                           0.77      2805
   macro avg       0.59      0.62      0.60      2805
weighted avg       0.80      0.77      0.78      2805

Confusion Matrix:
[[1986  394]
 [ 254  171]]
Fitting 3 folds for each of 243 candidates, totalling 729 fits
Лучшие параметры для Gradient Boosting: {'learning_rate': 0.2, 'max_depth': 5, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 300}
--- Gradient Boosting ---
              precision    recall  f1-score   support

           0       0.87      0.96      0.91      2380
           1       0.48      0.20      0.28   