# Creating model to predict Social Media Success

## 1) Prepare data

In [21]:
#import functions
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb  
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
from sklearn.preprocessing import StandardScaler

In [22]:
data = pd.read_csv('TikTokSpotifyMerged.csv')

In [23]:
# Transform the data to prepare it for usage in a machine learning model

# Clean duplicates based on track_id and popularity
#data = tik.drop_duplicates(subset=['track_id', 'popularity'], keep='first')

# Drop unnecessary columns
data = data.drop(['track_id',"target", "popularity", "target", "sections", "chorus_hit", "time_signature" ], axis=1)
data = data.drop(['tiktok', 'artist', 'spotify', "track"], axis=1)
data.info()

data = pd.get_dummies(data)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43808 entries, 0 to 43807
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sm_target          43808 non-null  float64
 1   duration_ms        43808 non-null  float64
 2   danceability       43808 non-null  float64
 3   energy             43808 non-null  float64
 4   key                43808 non-null  object 
 5   loudness           43808 non-null  float64
 6   mode               43808 non-null  object 
 7   speechiness        43808 non-null  float64
 8   acousticness       43808 non-null  float64
 9   instrumentalness   43808 non-null  float64
 10  liveness           43808 non-null  float64
 11  valence            43808 non-null  float64
 12  tempo              43808 non-null  float64
 13  era                43808 non-null  object 
 14  main_parent_genre  43808 non-null  object 
dtypes: float64(11), object(4)
memory usage: 5.0+ MB


## 2) Run Random Forest Classifier

### 2.1) Without Optimization

In [24]:
# Split your dataframe into features (X) and target variable (y)
X = data.drop('sm_target', axis=1) # assuming all other columns are features
y = data['sm_target']

# Split your data into training and test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


# Standardize your features because some models perform better with standardized data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


# Define the classifier and fit it to your data
randomforest = RandomForestClassifier(n_estimators=100, random_state=42)
randomforest.fit(X_train, y_train)

RandomForestClassifier(random_state=42)

In [25]:
# Predict the labels for the test set
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

y_pred = randomforest.predict(X_test)

# Evaluate the classifier
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_pred))
print('Classification Report:\n', classification_report(y_test, y_pred))

Accuracy: 0.9587613178117629
Confusion Matrix:
 [[12300   110]
 [  432   301]]
Classification Report:
               precision    recall  f1-score   support

         0.0       0.97      0.99      0.98     12410
         1.0       0.73      0.41      0.53       733

    accuracy                           0.96     13143
   macro avg       0.85      0.70      0.75     13143
weighted avg       0.95      0.96      0.95     13143



### 2.2) With Optimization

In [26]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for grid search
param_grid = {
    'n_estimators': [400, 600, 800],
    'max_depth': [None],
    'min_samples_split': [11, 12, 14],
    'min_samples_leaf': [1]
}

# Create the grid search object
grid_search = GridSearchCV(estimator=randomforest, param_grid=param_grid, cv=5)

# Fit the grid search to the training data
grid_search.fit(X_train, y_train)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

# Print the best parameters and best score
print("Best parameters: ", best_params)
print("Best score: ", best_score)

# Use the best parameters to create a new random forest classifier
best_randomforest = RandomForestClassifier(**best_params, class_weight='balanced')
best_randomforest.fit(X_train, y_train)

Best parameters:  {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 14, 'n_estimators': 600}
Best score:  0.956367193869232


RandomForestClassifier(class_weight='balanced', min_samples_split=14,
                       n_estimators=600)

In [27]:
# Evaluate the performance on the test set
# y_pred_best = best_randomforest.predict(X_test)

# Get predicted probabilities for positive class
y_pred_proba = best_randomforest.predict_proba(X_test)[:, 1]

# Set custom threshold
custom_threshold = 0.4

# Generate predictions based on custom threshold
y_pred_custom = (y_pred_proba > custom_threshold).astype(int)

print('Accuracy:', accuracy_score(y_test, y_pred_custom))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_pred_custom))
print('Classification Report:\n', classification_report(y_test, y_pred_custom))

Accuracy: 0.9415658525450811
Confusion Matrix:
 [[11849   561]
 [  207   526]]
Classification Report:
               precision    recall  f1-score   support

         0.0       0.98      0.95      0.97     12410
         1.0       0.48      0.72      0.58       733

    accuracy                           0.94     13143
   macro avg       0.73      0.84      0.77     13143
weighted avg       0.96      0.94      0.95     13143



In [28]:
import pickle

# Save the trained random forest classifier to a file
filename = 'SM_model.pkl'
pickle.dump(best_randomforest, open(filename, 'wb'))

## 3) Run Logistic Regression

In [29]:
from sklearn.linear_model import LogisticRegression

In [30]:
X = data.drop('sm_target', axis=1) # assuming all other columns are features
y = data['sm_target']

# Split your data into training and test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

# Scale the input features using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create and train the logistic regression model
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = logreg.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.94      1.00      0.97     20667
         1.0       0.00      0.00      0.00      1237

    accuracy                           0.94     21904
   macro avg       0.47      0.50      0.49     21904
weighted avg       0.89      0.94      0.92     21904



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## 4) Run Decision Tree Classifier

In [31]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

In [32]:
X = data.drop('sm_target', axis=1) # assuming all other columns are features
y = data['sm_target']

# Split your data into training and test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Scale the input features using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create and train the decision tree classifier
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = dt.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.97      0.97      0.97     12410
         1.0       0.46      0.47      0.47       733

    accuracy                           0.94     13143
   macro avg       0.71      0.72      0.72     13143
weighted avg       0.94      0.94      0.94     13143



## 5) Run XGB Classifier

In [33]:
from xgboost import XGBClassifier

In [34]:
X = data.drop('sm_target', axis=1) # assuming all other columns are features
y = data['sm_target']

# Split your data into training and test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Scale the input features using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create and train the XGBoost classifier
xgb = XGBClassifier()
xgb.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = xgb.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.97      0.99      0.98     12410
         1.0       0.66      0.42      0.51       733

    accuracy                           0.96     13143
   macro avg       0.81      0.70      0.74     13143
weighted avg       0.95      0.96      0.95     13143

