In [184]:
import ast
import csv
import json
import re
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pytz
import scipy.stats as stats
import seaborn as sns

# Загрузка и подготовка данных

In [185]:
# Read the per-repository CSV by hand because continuation rows leave the
# first column blank: such rows inherit columns 0 and 1 (bound below to
# company_name / company_description) from the closest preceding row whose
# first column is filled in.
with open("by_repo_results.csv", "r", encoding='utf-8') as f:
    csvreader = csv.reader(f, delimiter=";")
    headers = next(csvreader)
    data = []
    company_name = None
    company_description = None
    for row in csvreader:
        if not row:
            # csv.reader yields [] for a blank line; indexing it would raise.
            continue
        if row[0].strip():
            # A new organization starts here: remember its identifying columns.
            company_name = row[0]
            company_description = row[1]
        elif company_name is None:
            # Nothing to inherit from yet; fail with a readable message
            # instead of a NameError on an unbound variable.
            raise ValueError(
                f"by_repo_results.csv line {csvreader.line_num}: first column is "
                "blank but no earlier row provides values to inherit"
            )
        else:
            row[0] = company_name
            row[1] = company_description
        data.append(row)

# Drop rows with missing values and renumber the index from 0.
repo_df = pd.DataFrame(data=data, columns=headers).dropna().reset_index(drop=True)

org_df = pd.read_csv("by_org_results.csv", delimiter=";").dropna().reset_index(drop=True)

In [186]:
# Coerce the counters to integers; values that cannot be parsed become 0.
repo_df['repo_stars'] = pd.to_numeric(repo_df['repo_stars'], errors='coerce').fillna(0).astype(int)
repo_df['repo_half_year_commits'] = pd.to_numeric(repo_df['repo_half_year_commits'], errors='coerce').fillna(0).astype(int)

# Parse the commit dates into timezone-aware (UTC) datetimes; strings that do
# not match the format become NaT.
repo_df['repo_last_commit_date'] = pd.to_datetime(
    repo_df['repo_last_commit_date'],
    format='%Y-%m-%dT%H:%M:%SZ',
    errors='coerce',
    utc=True
)

org_df['stars_count'] = org_df['stars_count'].astype(np.int32)

# Take the current instant directly in UTC. The previous form,
# pd.Timestamp(datetime.now(), tz=UTC), labelled the naive *local* wall-clock
# time as UTC, shifting "today" by the machine's UTC offset and potentially
# moving the whole-day difference below by one.
today = pd.Timestamp.now(tz="UTC")
# Whole days elapsed since the last commit; NaT dates yield NaN.
repo_df['days_since_last_commit'] = (today - repo_df['repo_last_commit_date']).dt.days

# Создание новых признаков

### Группировка по организациям

In [187]:
def _count_topics(raw_topics):
    """Return the number of topics stored in one ``repo_topics`` cell.

    The cell is parsed as a Python literal (presumably the text form of a
    list -- TODO confirm against the CSV producer). Non-string and blank
    cells count as 0 topics. Malformed text still raises, so corrupt input
    is not silently hidden.
    """
    if not isinstance(raw_topics, str) or not raw_topics.strip():
        return 0
    # SECURITY: this used to be eval(), which executes whatever expression the
    # CSV cell contains. literal_eval only accepts plain literals.
    return len(ast.literal_eval(raw_topics))


# Parse the topics once per repository instead of three times per group, via a
# temporary column that exists only on the frame handed to groupby (repo_df
# itself is not modified).
grouped = (
    repo_df
    .assign(_topic_count=repo_df["repo_topics"].map(_count_topics))
    .groupby("organization")
    .agg(
        total_repos=("repo_name", "count"),
        mean_repo_stars=("repo_stars", "mean"),
        stars_count=("repo_stars", "sum"),
        # Repositories with at least one commit in the half-year window.
        active_repos=("repo_half_year_commits", lambda x: (x > 0).sum()),
        mean_half_year_commits=("repo_half_year_commits", "mean"),
        last_commit_days_ago_mean=("days_since_last_commit", "mean"),
        # True when any repository was committed to less than 90 days ago.
        is_active=("days_since_last_commit", lambda x: (x < 90).any()),
        # num_topics and tag_density are both "topics per repository"; they
        # were computed as mean(counts) and sum(counts) / len(counts), which
        # is the same quantity. Both columns are kept for downstream code.
        num_topics=("_topic_count", "mean"),
        tag_density=("_topic_count", "mean"),
        topics_total=("_topic_count", "sum"),
    )
    .reset_index()
)


### Извлечение языков

In [188]:
def extract_languages(lang_str):
    """Extract language names from the text form of a collection of strings.

    ``lang_str`` is parsed as a Python literal. For every entry, the text
    before the first ``:`` is kept and stripped of surrounding whitespace.

    Parsing is best-effort: any input that is not a valid literal, or whose
    entries are not strings, yields an empty list.

    Returns:
        list[str]: language names in the order they appear in the input.
    """
    try:
        # SECURITY: this used to be eval(), which would execute arbitrary
        # expressions found in the CSV. literal_eval accepts literals only.
        langs = ast.literal_eval(lang_str)
        return [l.split(':')[0].strip() for l in langs]
    except Exception:
        # Deliberate best-effort fallback. Unlike a bare ``except:``, this
        # lets KeyboardInterrupt / SystemExit propagate.
        return []

repo_df['parsed_languages'] = repo_df['repo_languages'].apply(extract_languages)

# One row per (repository, language), then the distinct languages of each
# organization. explode() turns an empty language list into a NaN row; those
# NaNs are dropped inside each group so they are not counted as a language.
# Dropping within the group (rather than before groupby) keeps organizations
# that have no languages at all, with an empty set.
lang_df = (
    repo_df
    .explode("parsed_languages")
    .groupby("organization")['parsed_languages']
    .agg(lambda x: set(x.dropna()))
    .reset_index()
)
lang_df['num_languages'] = lang_df['parsed_languages'].apply(len)

### Объединение

In [189]:
# Attach the per-organization aggregates and the language sets to org_df.
# Both frames carry a stars_count column, so the first merge suffixes them
# with _x (org_df) and _y (grouped); only the value from grouped is kept,
# under its original name.
features_df = (
    org_df
    .merge(grouped, how="left", on="organization")
    .merge(lang_df, how="left", on="organization")
    .drop(columns=['stars_count_x'])
    .rename(columns={'stars_count_y': 'stars_count'})
)

### Признаки на основе описания

In [190]:
# First description seen for each organization in repo_df.
descriptions = repo_df.groupby('organization')['description'].first().reset_index()
features_df = features_df.merge(descriptions, on='organization', how='left')

# Organizations absent from repo_df leave the left merge with a missing (NaN)
# description. len(str(NaN)) would report 3 (the length of "nan"), so a
# missing description is given length 0 explicitly.
features_df['description_length'] = features_df['description'].apply(
    lambda x: 0 if pd.isna(x) else len(str(x))
)
# 1 when the description is a string with non-whitespace content, else 0.
features_df['has_description'] = features_df['description'].apply(lambda x: int(isinstance(x, str) and x.strip() != ''))


### Дней с последнего коммита

In [191]:
# Smallest days_since_last_commit among each organization's repositories,
# joined onto the feature table by organization.
min_days_df = (
    repo_df
    .groupby('organization', as_index=False)['days_since_last_commit']
    .min()
)
features_df = features_df.merge(min_days_df, how='left', on='organization')

### Сводные признаки

In [192]:
# Derived ratio features. Columns are created in order, so activity_score can
# read the two columns defined just before it.
features_df = features_df.assign(
    # Half-year commit mean spread over six months.
    avg_commits_per_month=lambda d: d['mean_half_year_commits'] / 6,
    # Larger when the latest commit is more recent; +1 avoids dividing by 0.
    recent_commit_score=lambda d: 1 / (d['days_since_last_commit'] + 1),
    activity_score=lambda d: d['avg_commits_per_month'] * d['recent_commit_score'],
    # +1 in the denominators keeps organizations with 0 languages finite.
    description_density=lambda d: d['description_length'] / (d['num_languages'] + 1),
    topics_per_language=lambda d: d['topics_total'] / (d['num_languages'] + 1),
)

features_df.head(4)

Unnamed: 0,organization,top 3 repo,languages,ros_repos,name_parts,total_repos,mean_repo_stars,stars_count,active_repos,mean_half_year_commits,...,num_languages,description,description_length,has_description,days_since_last_commit,avg_commits_per_month,recent_commit_score,activity_score,description_density,topics_per_language
0,ros-acceleration,"['robotic_processing_unit', 'community', 'acce...","['Cuda', 'Python', 'C++', 'Tcl', 'C', 'CMake']","['ros2_kria', 'isaac_ros_image_pipeline']","['community', 'ament', 'vitis', 'acceleration'...",20.0,15.65,313.0,1.0,0.1,...,5.0,"Drive creation, maintenance and testing of har...",86,1,150.0,0.016667,0.006623,0.00011,14.333333,15.0
1,Aceinna,"['gnss-ins-sim', 'python-openimu', 'platform-a...","['JavaScript', 'Shell', 'MATLAB', 'M', 'Python...",['aceinna_openrtk_ros_driver'],"['platform', 'aceinna_imu', 'python', 'imu380'...",33.0,45.272727,1494.0,2.0,0.121212,...,6.0,"Open-source Inertial Navigation, GPS/INS, AHRS...",106,1,166.0,0.020202,0.005988,0.000121,15.142857,1.0
2,Accenture,"['adop-docker-compose', 'Ocaramba', 'adop-jenk...","['PowerShell', 'JavaScript', 'Thrift', 'R', 'C...",[],"['mahout', 'timeseriesr', 'environment', 'dash...",211.0,42.890995,9050.0,32.0,3.530806,...,31.0,Accenture Github site,21,1,81.0,0.588468,0.012195,0.007176,0.65625,8.53125
3,accerion,"['gazebo_ros_2d_map_plugin', 'librover', 'pepp...","['Shell', 'Julia', 'Python', 'C++', 'CMake', '...",['gazebo_ros_2d_map_plugin'],"['gazebo', 'ros', '2d', 'map', 'plugin', 'libr...",5.0,0.0,0.0,1.0,0.4,...,3.0,,0,0,151.0,0.066667,0.006579,0.000439,0.0,0.0


### Очистка и сохранение

In [193]:
# Keep only rows without missing values and write them out without the index.
classification_features = features_df.dropna().reset_index(drop=True)
classification_features.to_csv("features_for_classification.csv", index=False)