# In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, PolynomialFeatures
import pandas as pd
import re

# ---------------------------------------------------------------------------
# Training data: read the CSV and turn every model input into a number.
# ---------------------------------------------------------------------------
df = pd.read_csv('train.csv')


def _digits_to_float(raw_value):
    """Keep only digits and '.' from the cell's text and parse it as a float.

    Raises ValueError when nothing parseable is left (for example a missing
    cell, whose text form contains no digits) or when the remaining text is
    not a valid float literal.
    """
    # NOTE(review): any unit or magnitude suffix in the cell text is discarded
    # here; confirm against the data that this does not lose scale information.
    return float(re.sub('[^0-9.]', '', str(raw_value)))


# Both monetary columns get the same text -> float cleanup.
for money_column in ('Total Assets', 'Liabilities'):
    df[money_column] = df[money_column].apply(_digits_to_float)

# Integer-encode 'Party'. The fitted encoder is kept so the identical
# label -> integer assignment can be reused on other data.
label_encoder_party = LabelEncoder().fit(df['Party'])
df['Party'] = label_encoder_party.transform(df['Party'])

# Number the states in order of first appearance and keep the lookup table
# so the same numbering can be reapplied later.
distinct_states = df['state'].unique()
state_to_int = dict(zip(distinct_states, range(len(distinct_states))))
df['State_Label'] = df['state'].map(state_to_int)

# Model inputs (X) and the label to predict (y).
feature_columns = ['Criminal Case', 'Total Assets', 'Liabilities', 'Party', 'State_Label']
X = df[feature_columns]
y = df['Education']

# Split the raw feature table BEFORE fitting any preprocessing step.
# Fitting the scaler on the full table (as was done previously) lets the
# held-out rows influence the min/max used for scaling, which leaks
# information into training and makes the validation score less trustworthy.
# The split uses the same random_state and the same number of rows as before,
# so the same rows land in each partition.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=48)

# Learn each column's min/max from the training rows only, then apply that
# same mapping to the held-out rows. Training features end up in [0, 1];
# held-out values may fall slightly outside that range.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create interaction terms up to the second degree (products of distinct
# features only, because interaction_only=True).
poly = PolynomialFeatures(2, interaction_only=True)
X_train_interact = poly.fit_transform(X_train_scaled)
X_test_interact = poly.transform(X_test_scaled)

# Full-table versions, kept under their original names for any later code
# that refers to them. They are produced with the train-fitted transformers.
X_scaled = scaler.transform(X)
X_interact = poly.transform(X_scaled)

# Initialize the Random Forest Classifier with balanced class weights
rf_classifier_interact = RandomForestClassifier(
    n_estimators=500,
    max_depth=10,
    class_weight='balanced',
    random_state=44,
)

# Train the Random Forest Classifier on data with interaction terms
rf_classifier_interact.fit(X_train_interact, y_train)

# Predict on the held-out split with interaction terms
y_pred_interact = rf_classifier_interact.predict(X_test_interact)

# Weighted F1: per-class F1 averaged with weights proportional to class support
f1_interact = f1_score(y_test, y_pred_interact, average='weighted')
print("F1 Score with interaction terms:", f1_interact)

# ---------------------------------------------------------------------------
# Unlabeled data: apply the already-fitted preprocessing, then predict.
# ---------------------------------------------------------------------------
test_data = pd.read_csv('test.csv')

# Same text -> float cleanup as for the training table: keep digits and '.'
# only. float() raises ValueError if nothing parseable remains.
for money_column in ('Total Assets', 'Liabilities'):
    test_data[money_column] = test_data[money_column].apply(
        lambda cell: float(re.sub('[^0-9.]', '', str(cell)))
    )

# Reuse the encoder fitted on the training table. LabelEncoder.transform
# raises ValueError for a party it has not seen during fitting.
test_data['Party'] = label_encoder_party.transform(test_data['Party'])

# Reuse the training state numbering. A state missing from the lookup table
# maps to NaN.
test_data['State_Label'] = test_data['state'].map(state_to_int)

# Column order must match the order the scaler was fitted with.
model_inputs = ['Criminal Case', 'Total Assets', 'Liabilities', 'Party', 'State_Label']
test_data_scaled = scaler.transform(test_data[model_inputs])

# Expand with the same interaction terms used for training.
test_data_interact = poly.transform(test_data_scaled)

# Predict the education level for every row.
predicted_education_levels_interact = rf_classifier_interact.predict(test_data_interact)

# Pair each prediction with its row ID.
predicted_df_interact = pd.DataFrame(
    {
        'ID': test_data['ID'],
        'Predicted_Education': predicted_education_levels_interact,
    }
)



# Output: F1 Score with interaction terms: 0.23497446386597046
