In [21]:
## RANDOM FOREST CLASSIFICATION
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler

# Load training and testing data
train_data = pd.read_csv("combined_train.csv", usecols=['COM-1', 'COM-2', 'POP-1', 'STA-1', 'STA-2', 'STA-3', 'STA-4', 'STA-5', 'STA-6', 'STA-7', 'STA-8', 'STA-9', 'TEC-1', 'TEC-2', 'TEC-3', 'TEC-4', 'status'])
test_data = pd.read_csv("combined_test.csv", usecols=['COM-1', 'COM-2', 'POP-1', 'STA-1', 'STA-2', 'STA-3', 'STA-4', 'STA-5', 'STA-6', 'STA-7', 'STA-8', 'STA-9', 'TEC-1', 'TEC-2', 'TEC-3', 'TEC-4', 'status'])

# Remove rows with "Issues missing", "Commits missing", or "PRs missing"
train_data = train_data[(train_data != "Issues missing").all(axis=1) & (train_data != "Commits missing").all(axis=1) & (train_data != "PRs missing").all(axis=1)]
test_data = test_data[(test_data != "Issues missing").all(axis=1) & (test_data != "Commits missing").all(axis=1) & (test_data != "PRs missing").all(axis=1)]

# Preprocessing
X_train = train_data.drop(columns=['status'])
y_train = train_data['status']

X_test = test_data.drop(columns=['status'])
y_test = test_data['status']

# Oversampling the minority class
oversampler = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train, y_train)

# Model Training
model = RandomForestClassifier()
model.fit(X_train_resampled, y_train_resampled)

# Model Evaluation
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Optionally, you can also print feature importances
if hasattr(model, 'feature_importances_'):
    print("\nFeature Importances:")
    for feature, importance in zip(X_train.columns, model.feature_importances_):
        print(feature, ":", importance)

# Prediction (if needed)
# You can now use the trained model to make predictions on new data
# For example:
# new_data = pd.read_csv("new_data.csv")
# predictions = model.predict(new_data)


Accuracy: 0.972972972972973
Classification Report:
              precision    recall  f1-score   support

   graduated       0.97      1.00      0.99        35
     retired       1.00      0.50      0.67         2

    accuracy                           0.97        37
   macro avg       0.99      0.75      0.83        37
weighted avg       0.97      0.97      0.97        37


Feature Importances:
COM-1 : 0.009728534985065882
COM-2 : 0.08771286820972525
POP-1 : 0.04267216428054325
STA-1 : 0.026882121822406332
STA-2 : 0.03419295207857067
STA-3 : 0.09223115346902334
STA-4 : 0.005247974786370183
STA-5 : 0.0003006891526066756
STA-6 : 0.0011960319148888808
STA-7 : 0.025724177204980058
STA-8 : 0.15719533642913824
STA-9 : 0.34727465670585267
TEC-1 : 0.00784465296777163
TEC-2 : 0.005148527818857867
TEC-3 : 0.09704397312256476
TEC-4 : 0.059604185051634416


In [28]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler

# Load training and testing data
train_data = pd.read_csv("combined_train.csv", usecols=['COM-1', 'COM-2', 'POP-1', 'STA-1', 'STA-2', 'STA-3', 'STA-4', 'STA-5', 'STA-6', 'STA-7', 'STA-8', 'STA-9', 'TEC-1', 'TEC-2', 'TEC-3', 'TEC-4', 'status'])
test_data = pd.read_csv("combined_test.csv", usecols=['COM-1', 'COM-2', 'POP-1', 'STA-1', 'STA-2', 'STA-3', 'STA-4', 'STA-5', 'STA-6', 'STA-7', 'STA-8', 'STA-9', 'TEC-1', 'TEC-2', 'TEC-3', 'TEC-4', 'status'])

# Remove rows with "Issues missing", "Commits missing", or "PRs missing"
train_data = train_data[(train_data != "Issues missing").all(axis=1) & (train_data != "Commits missing").all(axis=1) & (train_data != "PRs missing").all(axis=1)]
test_data = test_data[(test_data != "Issues missing").all(axis=1) & (test_data != "Commits missing").all(axis=1) & (test_data != "PRs missing").all(axis=1)]

# Preprocessing
X_train = train_data.drop(columns=['status'])
y_train = train_data['status']

X_test = test_data.drop(columns=['status'])
y_test = test_data['status']

# Oversampling the minority class
oversampler = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train, y_train)

# Calculate the square root of the number of training records
k = int(np.sqrt(len(X_train_resampled)))

# Model Training
model = KNeighborsClassifier(n_neighbors=k)
model.fit(X_train_resampled, y_train_resampled)

model = KNeighborsClassifier(n_neighbors=k)
model.fit(X_train_resampled, y_train_resampled)

# Model Evaluation
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Prediction (if needed)
# You can now use the trained model to make predictions on new data
# For example:
# new_data = pd.read_csv("new_data.csv")
# predictions = model.predict(new_data)



Accuracy: 0.7567567567567568
Classification Report:
              precision    recall  f1-score   support

   graduated       0.93      0.80      0.86        35
     retired       0.00      0.00      0.00         2

    accuracy                           0.76        37
   macro avg       0.47      0.40      0.43        37
weighted avg       0.88      0.76      0.81        37

