# Importing Libraries and Packages

In [1]:
# Base
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# EDA
import missingno as msno

# Visualisation
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import seaborn as sns
from sklearn.tree import plot_tree

# Text-Based Analyses
import gensim.downloader
from gensim.models import KeyedVectors
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
from sklearn.metrics.pairwise import cosine_similarity

# One-hot Encoding
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Voting
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import StackingClassifier
from xgboost import XGBClassifier

# Classifiers
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestNeighbors
from sklearn.linear_model import LogisticRegression

from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier

# Regressors
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# Clustering
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
import scipy.cluster.hierarchy as sch

# Scalers and Transformers
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, KBinsDiscretizer
from sklearn.impute import SimpleImputer

# Metrics
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, recall_score, precision_score, roc_curve, roc_auc_score, f1_score
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from scipy.stats import zscore, boxcox
from sklearn.model_selection import cross_val_score

# Model Inspection
from sklearn.inspection import permutation_importance

In [2]:
# Seed handed to the models below as `random_state` so repeated runs are comparable
SEED = 50

# Importing Data

In [3]:
# Relative location of the raw Kickstarter projects CSV
path = "data/kickstarter_projects.csv"

# Load the file with pandas' default parsing and preview the first two rows
df_kickstarter = pd.read_csv(filepath_or_buffer=path)
df_kickstarter.head(n=2)

Unnamed: 0,ID,Name,Category,Subcategory,Country,Launched,Deadline,Goal,Pledged,Backers,State
0,1860890148,Grace Jones Does Not Give A F$#% T-Shirt (limi...,Fashion,Fashion,United States,2009-04-21 21:02:48,2009-05-31,1000,625,30,Failed
1,709707365,CRYSTAL ANTLERS UNTITLED MOVIE,Film & Video,Shorts,United States,2009-04-23 00:07:53,2009-07-20,80000,22,3,Failed


In [None]:
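# TODO: import the train and test CSV files. The cells below use X_train_scaled,
# X_test_scaled, X_test, y_train and y_test, none of which are defined in the
# visible part of this notebook.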

# Training and optimising different models

### Random forest

In [None]:
# Random forest configured with the tuned hyperparameters.
# NOTE(review): `best_params` is not defined in the visible cells — presumably it
# comes from an earlier tuning step; confirm it exists on a fresh kernel.
best_rf_model = RandomForestClassifier(
    random_state=SEED,
    **best_params,
)

In [None]:
# Fit the random forest on the scaled training features and their labels
best_rf_model.fit(X=X_train_scaled, y=y_train)

In [None]:
# Predicted class labels of the random forest for the scaled test features
y_pred = best_rf_model.predict(X=X_test_scaled)

In [None]:
# Test-set evaluation of the random forest: per-class report and confusion matrix
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

# AUC is computed from the probability column at index 1 of `predict_proba`
rf_class_proba = best_rf_model.predict_proba(X_test_scaled)
rf_auc = roc_auc_score(y_test, rf_class_proba[:, 1])
print(f"AUC-ROC: {rf_auc}")

#### Feature Importance 

In [None]:
# Importances reported by the fitted random forest, one value per input feature
importances = best_rf_model.feature_importances_
print(importances)

# Pair every importance with its column name and rank from highest to lowest.
# NOTE(review): assumes `X_test` is a DataFrame whose columns line up with the
# columns of the scaled arrays the model was trained on — confirm.
feature_names = X_test.columns
feature_importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': importances}
).sort_values(by='Importance', ascending=False)
print(feature_importance_df)


# Visualisation: horizontal bar chart of the top-ranked features
top_n = 20
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=feature_importance_df.head(top_n), x="Importance", y="Feature", ax=ax)
ax.set_xlabel("Importance")
ax.set_ylabel("Feature")
ax.set_title(f"Top {top_n} Feature Importances")
fig.tight_layout()
plt.show()

### XGBoost

Best hyperparameters: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 200, 'subsample': 1.0}

In [None]:
# Create a new XGBoost model with the best hyperparameters.
# The values are the search result recorded in the note directly above this
# cell; they are spelled out here so the cell runs on a fresh kernel
# (`grid_search` is not defined anywhere in the visible notebook).
best_xgb_params = {
    'colsample_bytree': 0.8,
    'learning_rate': 0.1,
    'max_depth': 6,
    'n_estimators': 200,
    'subsample': 1.0,
}

# `XGBClassifier` is imported by name (`from xgboost import XGBClassifier`);
# there is no `xgb` alias in this notebook, so `xgb.XGBClassifier` raised NameError.
# NOTE(review): recent xgboost releases deprecate `use_label_encoder`; it is kept
# unchanged here — drop it if the installed version warns about it.
best_xgb_model = XGBClassifier(
    **best_xgb_params,
    use_label_encoder=False,
    random_state=SEED,
    eval_metric='logloss',
)

In [None]:
# Fit the tuned XGBoost classifier on the scaled training features and labels
best_xgb_model.fit(X=X_train_scaled, y=y_train)

In [None]:
# Class labels predicted by the tuned XGBoost model on the scaled test features
y_pred = best_xgb_model.predict(X=X_test_scaled)

# Keep only the probability column at index 1 for the AUC computation
xgb_class_proba = best_xgb_model.predict_proba(X=X_test_scaled)
y_prob = xgb_class_proba[:, 1]

In [None]:
# Test-set evaluation of the XGBoost model: per-class report, accuracy and AUC
print(classification_report(y_test, y_pred))

xgb_accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {xgb_accuracy}")

xgb_auc = roc_auc_score(y_test, y_prob)
print(f"AUC-ROC: {xgb_auc}")

#### Feature Importance

In [None]:
# Read the per-feature importances from the fitted XGBoost model and print the
# raw array; the next cell attaches feature names to these values.
importances_xgb = best_xgb_model.feature_importances_
print(importances_xgb)

In [None]:
# Column names are taken from the unscaled test frame.
# NOTE(review): assumes `X_test` is a DataFrame whose columns line up with the
# columns of the scaled arrays the model was trained on — confirm.
feature_names = X_test.columns

# Attach a name to every importance and rank from highest to lowest
feature_importance_df_xgb = pd.DataFrame(
    {'Feature': feature_names, 'Importance': importances_xgb}
).sort_values(by='Importance', ascending=False)
print(feature_importance_df_xgb)

In [None]:
# Visualisation: horizontal bar chart of the top-ranked XGBoost features
top_n = 20
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=feature_importance_df_xgb.head(top_n), x="Importance", y="Feature", ax=ax)
ax.set_xlabel("Importance")
ax.set_ylabel("Feature")
ax.set_title(f"Top {top_n} Feature Importances")
fig.tight_layout()
plt.show()

### SVM

In [None]:
# Support vector classifier built from the selected hyperparameters.
# NOTE(review): `probability` is not set here, so it stays at the library
# default — check this against any later use of `predict_proba`.
svm_hyperparams = {'C': 0.1, 'kernel': 'rbf', 'gamma': 1}
svm_model = SVC(**svm_hyperparams, random_state=SEED)

In [None]:
# Fit the SVM on the scaled training features and their labels
svm_model.fit(X=X_train_scaled, y=y_train)

In [None]:
# Test: class labels predicted by the SVM on the scaled test features
y_pred = svm_model.predict(X_test_scaled)

# `svm_model` is constructed without `probability=True`, so `predict_proba` is
# not available on it and the previous call failed. Use the decision-function
# scores instead: `roc_auc_score` accepts non-thresholded decision values.
# The name `y_prob` is kept because the evaluation cell reads it; note that it
# now holds decision scores, not probabilities.
# NOTE(review): assumes a binary target (the other models slice `[:, 1]`), in
# which case `decision_function` returns one score per sample — confirm.
y_prob = svm_model.decision_function(X_test_scaled)

In [None]:
# Test-set evaluation of the SVM: per-class report, accuracy and AUC
print(classification_report(y_test, y_pred))

svm_accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {svm_accuracy}")

svm_auc = roc_auc_score(y_test, y_prob)
print(f"AUC-ROC: {svm_auc}")

#### Feature Importance

In [None]:
# Get feature importances.
# `SVC` has no `feature_importances_` attribute (and an RBF kernel exposes no
# `coef_` either), so the previous attribute access failed. Estimate the
# importances model-agnostically instead: permutation importance shuffles one
# feature at a time on the test set and records how much the model's score drops.
# This re-scores the SVM once per feature and repeat, so it can be slow on
# large test sets.
perm_result = permutation_importance(
    svm_model,
    X_test_scaled,
    y_test,
    n_repeats=5,
    random_state=SEED,
    n_jobs=-1,
)
importances = perm_result.importances_mean  # mean score drop over the repeats
print(importances)

# NOTE(review): assumes `X_test` is a DataFrame whose columns line up with the
# columns of `X_test_scaled` — confirm.
feature_names = X_test.columns  # To get the names of the features
feature_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances
})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
print(feature_importance_df)


# Visualisation: Feature Importance (top20)
top_n = 20
plt.figure(figsize=(10, 6))
sns.barplot(x="Importance", y="Feature", data=feature_importance_df.head(top_n))
plt.xlabel("Importance")
plt.ylabel("Feature")
plt.title(f"Top {top_n} Feature Importances")
plt.tight_layout()
plt.show()