In [43]:
import ast
import csv
import json
import re
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pytz
import scipy.stats as stats
import seaborn as sns

# Загрузка и подготовка данных

In [44]:
# Read the per-repository CSV. The first two columns are only filled on the
# first row of each group, so their last non-empty values are carried forward
# ("filled down") onto the following rows.
# newline='' is what the csv module requires for correct handling of quoted
# fields that contain line breaks.
with open("by_repo_results.csv", "r", encoding='utf-8', newline='') as f:
    csvreader = csv.reader(f, delimiter=";")
    headers = next(csvreader)
    data = []
    # Initialised up front so a file whose first data row has an empty first
    # cell does not fail with NameError in the fill-down branch.
    company_name = ""
    company_description = ""
    for row in csvreader:
        if not row:
            # csv.reader yields [] for blank lines; indexing them would raise.
            continue
        if row[0].strip():
            # A new group starts here: remember its values for the rows below.
            company_name, company_description = row[0], row[1]
        else:
            row[0], row[1] = company_name, company_description
        data.append(row)

repo_df = pd.DataFrame(data=data, columns=headers).dropna().reset_index(drop=True)

org_df = pd.read_csv("by_org_results.csv", delimiter=";").dropna().reset_index(drop=True)

In [45]:
# Coerce the counter columns to integers; values that cannot be parsed become
# NaN first and are then replaced with 0.
for count_col in ('repo_stars', 'repo_half_year_commits'):
    parsed = pd.to_numeric(repo_df[count_col], errors='coerce')
    repo_df[count_col] = parsed.fillna(0).astype(int)

# Parse the last-commit timestamps into timezone-aware (UTC) datetimes;
# values that do not match the format become NaT.
repo_df['repo_last_commit_date'] = pd.to_datetime(
    repo_df['repo_last_commit_date'],
    utc=True,
    errors='coerce',
    format='%Y-%m-%dT%H:%M:%SZ',
)

org_df['stars_count'] = org_df['stars_count'].astype('int32')

# Создание новых признаков

#### 1. Признаки, извлеченные из описаний и метаданных репозиториев

In [46]:
# description_length: number of characters in the description (0 when missing).
repo_df['description_length'] = repo_df['description'].fillna('').str.len()

# has_description: 1 when the description is non-empty, otherwise 0.
repo_df['has_description'] = (repo_df['description_length'] > 0).astype(int)


def count_topics(raw_topics):
    """Return the number of topics in a stringified list such as "['a', 'b']".

    Values that are not strings starting with "[" (NaN, None, empty text)
    count as zero topics, and so does text that cannot be parsed as a literal.
    """
    if not isinstance(raw_topics, str) or not raw_topics.startswith("["):
        return 0
    try:
        # SECURITY: this text comes from a CSV file. eval() would execute any
        # expression placed in the cell; literal_eval only accepts literals.
        topics = ast.literal_eval(raw_topics)
    except (ValueError, TypeError, SyntaxError):
        return 0
    return len(topics)


# num_topics: number of topic tags attached to the repository.
repo_df['num_topics'] = repo_df['repo_topics'].apply(count_topics)

# признак num_languages - количество используемых технологий
def count_languages(lang_string):
    """Count the languages listed in a stringified tuple of "Name: share%" entries.

    Parameters
    ----------
    lang_string : str or missing value
        Text such as "('C++: 49.5%, Python: 39.0%',)".

    Returns
    -------
    int
        Number of comma-separated parts that contain a colon. Missing or
        non-string input (NaN, None, numbers, lists) gives 0, as does a
        placeholder without any colon such as "('%',)".
    """
    # An explicit type check replaces the former bare ``except``: non-text
    # values carry no language information, and nothing below can raise for
    # a genuine string.
    if not isinstance(lang_string, str):
        return 0
    parts = lang_string.strip("()").split(",")
    # Only "Name: share" pairs count as languages.
    return sum(1 for part in parts if ":" in part)

# num_languages: per-repository language count derived from repo_languages.
repo_df['num_languages'] = repo_df['repo_languages'].map(count_languages)


#### 2. Признаки, основанные на активности

In [47]:
# days_since_last_commit: whole days elapsed since the last commit.
# NOTE: the reference point is the moment this cell runs, so the values
# change from one run to the next.
current_date = datetime.now(pytz.utc)


def calculate_days_since_last_commit(date):
    """Return whole days between ``current_date`` and ``date`` (None if missing).

    The column below is computed in vectorised form; this helper is kept so
    that any other cell calling it keeps working.
    """
    if pd.isna(date):
        return None
    return (current_date - date).days


# Vectorised form of applying the helper row by row; NaT dates give NaN.
days_since = (pd.Timestamp(current_date) - repo_df['repo_last_commit_date']).dt.days

# A commit timestamp in the future (clock skew) gives a negative age, and an
# age of -1 makes 1 / (1 + days) divide by zero, so ages are floored at zero.
days_since = days_since.clip(lower=0)

# Repositories without a parsable commit date used to be filled with 0, which
# made them look as if they had been committed to today. They are now treated
# as the stalest repositories and receive the largest observed age.
stalest_age = days_since.max()
missing_fill = 0 if pd.isna(stalest_age) else stalest_age
repo_df['days_since_last_commit'] = days_since.fillna(missing_fill).astype(int)

# avg_commits_per_month: the half-year commit total divided by six.
half_year_commits = repo_df['repo_half_year_commits']
repo_df['avg_commits_per_month'] = half_year_commits / 6

# is_active: 1 when the repository has more than 10 commits in the half-year
# window and its last commit is fewer than 180 days old, otherwise 0.
has_enough_commits = half_year_commits.gt(10)
committed_recently = repo_df['days_since_last_commit'].lt(180)
repo_df['is_active'] = (has_enough_commits & committed_recently).astype(int)

#### 3. Признаки, основанные на организации

In [48]:
# Organization-level features, aggregated over each organization's repositories:
#   org_avg_stars      - mean number of stars per repository
#   org_total_stars    - total number of stars
#   org_total_projects - number of repositories (rows with a star count)
#   org_avg_commits    - mean half-year commit count per repository
org_features = (
    repo_df
    .groupby('organization', as_index=False)
    .agg(
        org_avg_stars=('repo_stars', 'mean'),
        org_total_stars=('repo_stars', 'sum'),
        org_total_projects=('repo_stars', 'count'),
        org_avg_commits=('repo_half_year_commits', 'mean'),
    )
)

# Remove the results of a previous run before merging: merging a second time
# would otherwise produce suffixed duplicates (org_avg_stars_x / _y).
org_feature_cols = [col for col in org_features.columns if col != 'organization']
repo_df = (
    repo_df
    .drop(columns=org_feature_cols, errors='ignore')
    .merge(org_features, on='organization', how='left')
)

#### 4. Признаки взаимодействия

In [49]:
# Inputs shared by the interaction features below.
idle_days = repo_df['days_since_last_commit']
topic_count = repo_df['num_topics']
desc_len = repo_df['description_length']

# recent_commit_score: 1 / (1 + days since the last commit).
repo_df['recent_commit_score'] = 1 / (1 + idle_days)

# tag_density: number of topics per description character (length + 1).
repo_df['tag_density'] = topic_count / (desc_len + 1)

# activity_score: monthly commit rate multiplied by the recency score.
monthly_rate = repo_df['avg_commits_per_month']
repo_df['activity_score'] = monthly_rate * repo_df['recent_commit_score']

# description_density: description length per language (count + 1).
repo_df['description_density'] = desc_len / (1 + repo_df['num_languages'])

# topics_per_language: number of topics per language (count + 1).
repo_df['topics_per_language'] = topic_count / (1 + repo_df['num_languages'])

In [50]:
# Drop twitter_link: it is considered to carry no useful information here.
repo_df = repo_df.drop(columns=['twitter_link'])
repo_df.head()

Unnamed: 0,organization,description,repo_name,repo_stars,repo_topics,repo_languages,repo_half_year_commits,repo_last_commit_date,description_length,has_description,...,is_active,org_avg_stars,org_total_stars,org_total_projects,org_avg_commits,recent_commit_score,tag_density,activity_score,description_density,topics_per_language
0,ros-acceleration,"Drive creation, maintenance and testing of har...",community,61,"['cpu', 'fpga', 'gpu', 'hardware', 'hardware-a...","('%',)",0,2024-01-04 09:54:23+00:00,86,1,...,0,15.65,313,20,0.1,0.002037,0.08046,0.0,86.0,7.0
1,ros-acceleration,"Drive creation, maintenance and testing of har...",ament_vitis,8,[],"('CMake: 100.0%',)",0,2023-08-14 09:36:49+00:00,86,1,...,0,15.65,313,20,0.1,0.001577,0.0,0.0,43.0,0.0
2,ros-acceleration,"Drive creation, maintenance and testing of har...",acceleration_firmware_kv260,10,"['acceleration', 'fpga', 'hardware', 'hardware...","('CMake: 100.0%',)",0,2022-11-09 11:32:38+00:00,86,1,...,0,15.65,313,20,0.1,0.001096,0.103448,0.0,43.0,4.5
3,ros-acceleration,"Drive creation, maintenance and testing of har...",acceleration_firmware,0,[],"('CMake: 100.0%',)",0,2021-09-09 08:17:35+00:00,86,1,...,0,15.65,313,20,0.1,0.000747,0.0,0.0,43.0,0.0
4,ros-acceleration,"Drive creation, maintenance and testing of har...",acceleration_examples,42,"['fpga', 'gpu', 'hardware-acceleration', 'ros2']","('C++: 49.51012602479045%, Python: 39.08205153...",2,2024-12-13 08:03:32+00:00,86,1,...,0,15.65,313,20,0.1,0.006803,0.045977,0.002268,14.333333,0.666667


In [52]:
# Categorical target with four classes, built from fixed star-count thresholds
# (not quantiles): 0 -> low, 1-10 -> medium, 11-100 -> high, >100 -> very_high.
star_edges = [-1, 0, 10, 100, np.inf]
star_labels = ['low', 'medium', 'high', 'very_high']
repo_df['stars_class'] = pd.cut(repo_df['repo_stars'], bins=star_edges, labels=star_labels)

# Number of repositories in each class.
target = repo_df['stars_class'].value_counts()
target


stars_class
low          8173
medium       3928
high         1788
very_high     693
Name: count, dtype: int64