In [None]:
import xgboost as xgb  
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
from sklearn.preprocessing import StandardScaler

In [None]:
# Transform the data to prepare it for usage in a machine learning model.
#
# NOTE(review): `data` must already exist when this cell runs. The
# de-duplication step that appears to have created it is commented out below;
# confirm where `data` is built before relying on Restart & Run All.

# Clean duplicates based on track_id and popularity
#data = tik.drop_duplicates(subset=['track_id', 'popularity'], keep='first')

# Columns that are removed before modelling. Each label is listed exactly once
# ("target" used to appear twice) and everything is dropped in a single call,
# so a missing column raises before `data` is reassigned instead of leaving
# the frame half-transformed after a first successful drop.
columns_to_drop = [
    'track_id', 'target', 'popularity', 'sections', 'chorus_hit',
    'time_signature', 'tiktok', 'artist', 'spotify', 'track',
]
data = data.drop(columns=columns_to_drop)
data.info()

# Convert the remaining categorical columns into indicator (dummy) columns.
data = pd.get_dummies(data)

In [None]:
# Separate the label column from the predictors (every column except 'sm_target').
y = data['sm_target']
X = data.drop(columns='sm_target')

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply that
# same transform to both partitions.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Build a 100-tree random forest with a fixed seed and train it on the scaled rows.
randomforest = RandomForestClassifier(random_state=42, n_estimators=100)
randomforest.fit(X_train, y_train)

In [None]:
# Predict the labels for the test set
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Predict labels for the held-out rows with the fitted forest.
y_pred = randomforest.predict(X_test)

# Print each evaluation metric under its heading, in a fixed order:
# accuracy, confusion matrix, then the per-class report.
for heading, scorer in (
    ('Accuracy:', accuracy_score),
    ('Confusion Matrix:\n', confusion_matrix),
    ('Classification Report:\n', classification_report),
):
    print(heading, scorer(y_test, y_pred))

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid explored by the search. max_depth and min_samples_leaf
# are pinned to one value each, so 3 x 3 = 9 combinations are evaluated.
param_grid = {
    'n_estimators': [150, 200, 250],
    'max_depth': [None],
    'min_samples_split': [8, 10, 12],
    'min_samples_leaf': [1]
}

# Search over the same configuration that is ultimately kept: the estimator
# being tuned carries class_weight='balanced' and a fixed seed. Previously the
# search tuned an unweighted forest while the final model added
# class_weight='balanced' and dropped the seed, so the reported best score did
# not describe the final model and the final fit was not repeatable.
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(class_weight='balanced', random_state=42),
    param_grid=param_grid,
    cv=5,
)

# Run the 5-fold cross-validated search on the training data.
grid_search.fit(X_train, y_train)

# Best parameter combination and its mean cross-validated score.
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best parameters: ", best_params)
print("Best score: ", best_score)

# GridSearchCV refits the winning combination on the full training set by
# default (refit=True), so that fitted estimator is reused instead of
# training an equivalent model a second time.
best_randomforest = grid_search.best_estimator_
best_randomforest

In [None]:
# Evaluate the tuned forest on the test set using a custom decision threshold
# instead of the classifier's own predict().

# Probability cut-off above which a row is labelled 1.
custom_threshold = 0.4

# Second column of predict_proba is used as the positive-class probability.
# NOTE(review): this assumes a binary problem whose positive label sits at
# index 1 of best_randomforest.classes_ — confirm.
class_probabilities = best_randomforest.predict_proba(X_test)
y_pred_proba = class_probabilities[:, 1]

# Rows strictly above the threshold become 1, all others 0.
above_threshold = y_pred_proba > custom_threshold
y_pred_custom = above_threshold.astype(int)

# Print each metric for the thresholded predictions under its heading.
for heading, scorer in (
    ('Accuracy:', accuracy_score),
    ('Confusion Matrix:\n', confusion_matrix),
    ('Classification Report:\n', classification_report),
):
    print(heading, scorer(y_test, y_pred_custom))

In [None]:
import pickle

# Save the trained random forest classifier to a file.
filename = 'randomforest_model.pkl'

# The context manager closes the file handle even if pickling raises; the
# previous bare open() call never closed it.
with open(filename, 'wb') as model_file:
    pickle.dump(best_randomforest, model_file)

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Features are every column except the label column 'sm_target'.
X = data.drop('sm_target', axis=1)
y = data['sm_target']

# Split the data into training and test datasets.
# NOTE(review): this cell holds out 50% of the rows while the other model
# cells hold out 30% — confirm the difference is intentional.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

# Scale the input features; the scaler is fitted on the training rows only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create and train the logistic regression model on the SCALED features.
# Previously the scaled arrays were computed but the model was fitted on the
# raw X_train, so the scaling step had no effect.
logreg = LogisticRegression()
logreg.fit(X_train_scaled, y_train)

# Predict on the test rows, transformed with the same scaler as the training rows.
y_pred = logreg.predict(X_test_scaled)

# Evaluate the model
print(classification_report(y_test, y_pred))

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

In [None]:
# Features are every column except the label column 'sm_target'.
X = data.drop('sm_target', axis=1)
y = data['sm_target']

# Split the data into training and test datasets (30% held out, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Scale the input features; the scaler is fitted on the training rows only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create and train the decision tree on the scaled features, so the scaling
# step above is actually used and the preprocessing matches the random forest
# cell. A fixed random_state makes the fitted tree repeatable between runs.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train_scaled, y_train)

# Predict on the test rows, transformed with the same scaler as the training rows.
y_pred = dt.predict(X_test_scaled)

# Evaluate the model
print(classification_report(y_test, y_pred))

In [None]:
from xgboost import XGBClassifier

In [None]:
# Features are every column except the label column 'sm_target'.
X = data.drop('sm_target', axis=1)
y = data['sm_target']

# Split the data into training and test datasets (30% held out, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Scale the input features; the scaler is fitted on the training rows only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create and train the XGBoost classifier on the scaled features, so the
# scaling step above is actually used.
# The model is bound to `xgb_model` rather than `xgb`: the notebook imports
# the xgboost module as `xgb`, and reusing that name for the fitted model
# would shadow the module for every cell that runs afterwards.
xgb_model = XGBClassifier()
xgb_model.fit(X_train_scaled, y_train)

# Predict on the test rows, transformed with the same scaler as the training rows.
y_pred = xgb_model.predict(X_test_scaled)

# Evaluate the model
print(classification_report(y_test, y_pred))