<a href="https://colab.research.google.com/github/nvdraper04/nvdraper04.github.io/blob/main/Copy_of_User_Copy_of_Social_Media_Prediction_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Instructions:**
**Here is the link to the data needed for the model:** https://docs.google.com/spreadsheets/d/1t9lz4LYOMkNrJvGVRj-JNHD7XotbuiOdUpVUh4PoDEI/edit?usp=sharing


After the Google Sheet is added to your Google Drive, you can hit the "Run all" button above and it will run all the code for you. At the end of the run, it should take you directly to the chunk of code where you can make a social media prediction.

# Importing packages and Cleaning Data

In [None]:

#!pip install pandas scikit-learn streamlit

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from google.colab import auth
import gspread
from google.auth import default

# Sign the Colab session in to Google so the Sheets API can be used.
auth.authenticate_user()

# Build an authorized gspread client from the session's default credentials.
creds = default()[0]
gc = gspread.authorize(creds)

# Look the spreadsheet up by its title and take its first worksheet.
spreadsheet = gc.open('User Copy of Combined Social Media Data')
worksheet = spreadsheet.sheet1

# Pull every cell as a list of rows; the first row holds the column headers.
data = worksheet.get_all_values()
header_row, record_rows = data[0], data[1:]
df = pd.DataFrame(record_rows, columns=header_row)

# Optional sanity check of the loaded rows:
#display(df.tail())

In [None]:

# Clean the raw sheet data held in `df`: numeric conversion and imputation,
# placeholder values for blank text cells, and calendar features derived from
# the publish date and time.

# Optional: inspect missing values before cleaning
#print(df.isnull().sum())

# Columns that must be numeric for modelling.
numerical_cols_to_convert = ['Duration (sec)', 'Impressions', 'Reach', 'Plays', 'Saves', 'Likes', 'Shares', 'Comments']
# Same columns; the name is kept because the original cell defined it.
numerical_cols_with_missing = list(numerical_cols_to_convert)

for col in numerical_cols_to_convert:
    # Anything that cannot be parsed as a number (including blank cells) becomes NaN...
    df[col] = pd.to_numeric(df[col], errors='coerce')
    # ...and NaN is then replaced by the mean of the column's parsed values.
    df[col] = df[col].fillna(df[col].mean())

# Missing descriptions become an empty string.
df['Description'] = df['Description'].fillna("")

# Missing 'Genres' / 'Content_type' get a placeholder. The sheet is loaded with
# worksheet.get_all_values(), which yields blank cells as empty strings rather
# than NaN, so empty strings are mapped to the placeholder as well.
for col in ('Genres', 'Content_type'):
    df[col] = df[col].fillna('Unknown').replace('', 'Unknown')

# Parse 'Publish_Date' and 'Time'; unparseable values become NaT.
df["Publish_Date"] = pd.to_datetime(df["Publish_Date"].astype(str), errors='coerce')
df["Time"] = pd.to_datetime(df["Time"].astype(str), errors='coerce')

# Calendar features used later as categorical inputs.
df['Year'] = df['Publish_Date'].dt.year
df['Month'] = df['Publish_Date'].dt.month
df['Day_of_Week'] = df['Publish_Date'].dt.dayofweek # Monday=0, Sunday=6
# 'Time' is already a datetime column at this point, so read the hour directly.
df['Hour'] = df['Time'].dt.hour

# Optional checks:
#print(df['Hour'])
#print(df["Day_of_Week"])
#print(df[numerical_cols_to_convert].dtypes)


In [None]:
from sklearn import preprocessing

# Categorical inputs currently in use (genre and content type are left out).
# Variant including them:
# ["Platform", "Post type", "Genres", "Content_type", "Hour", "Month", "Day_of_Week"]
categorical_features = ["Platform", "Post type", "Hour", "Month", "Day_of_Week"]

# handle_unknown='ignore': a category not seen during fitting is encoded as
# all zeros at transform time instead of raising an error.
encoder = preprocessing.OneHotEncoder(handle_unknown='ignore')
encoder.fit(df[categorical_features])
one_hot_features = encoder.transform(df[categorical_features])
one_hot_names = encoder.get_feature_names_out()
#print("Type of one_hot_columns is:",type(one_hot_features))

In [None]:
# Wrap the sparse one-hot matrix in a DataFrame whose columns carry the
# encoder's feature names, so each indicator column is self-describing.
one_hot_df = pd.DataFrame.sparse.from_spmatrix(one_hot_features, columns=one_hot_names)
#one_hot_df.head()

In [None]:
import re
from sklearn.feature_extraction.text import CountVectorizer
import scipy
import numpy as np
from sklearn.model_selection import train_test_split

def clean_text(text):
    """Return ``text`` lower-cased with all punctuation removed.

    Every character that is neither a word character nor whitespace is
    dropped; whitespace itself is left untouched.
    """
    lowered = text.lower()
    return re.sub(r'[^\w\s]', '', lowered)

# Normalised caption text (lower-cased, punctuation stripped).
df['clean_caption'] = df['Description'].map(clean_text)

# TF-IDF representation of the captions, vocabulary capped at 1000 terms.
# NOTE(review): the vectorizer is fit on the raw 'Description' column, so
# 'clean_caption' is not used by the model.
tfidf = TfidfVectorizer(max_features=1000)
tfidf_matrix = tfidf.fit_transform(df['Description'])

# Numerical inputs fed to the model. Year, Month, Day_of_Week and Hour are
# deliberately absent here: the time features are handled as categorical ones.
numerical_feature_names = ["Duration (sec)", "Reach", "Shares", "Plays", "Comments", "Saves"]
numerical_features = df[numerical_feature_names]

# Show the dtypes so a non-numeric column is easy to spot
print(numerical_features.dtypes)

# Three feature matrices are assembled:
#   features                             -> numerical + one-hot
#   features_with_text                   -> numerical + one-hot + TF-IDF
#   features_with_text_without_numerical -> one-hot + TF-IDF
features_without_numerical = scipy.sparse.csr_matrix(one_hot_features)
features = scipy.sparse.hstack((numerical_features, one_hot_features), format='csr')
features_with_text = scipy.sparse.hstack((features, tfidf_matrix), format='csr')
features_with_text_without_numerical = scipy.sparse.hstack(
    (features_without_numerical, tfidf_matrix), format='csr')

# Column labels for the numerical + one-hot part of the matrices.
all_feature_names = np.hstack((numerical_feature_names, one_hot_names))

# Regression target
target = df['Likes']
print(target)

# 80 / 20 train-test split. A fixed seed keeps the split reproducible, and
# splitting every matrix in a single call guarantees they share the same rows.
rand_seed = 51
(
    features_train, features_test,
    features_train_with_text, features_test_with_text,
    features_train_with_text_without_numerical, features_test_with_text_without_numerical,
    target_train, target_test,
) = train_test_split(
    features,
    features_with_text,
    features_with_text_without_numerical,
    target,
    test_size=0.2,
    random_state=rand_seed,
)





Duration (sec)      int64
Reach             float64
Shares              int64
Plays             float64
Comments            int64
Saves             float64
dtype: object
0        2
1        6
2        1
3        1
4        1
        ..
1297     2
1298     5
1299     2
1300     0
1301    23
Name: Likes, Length: 1302, dtype: int64


# Model Training and Accuracy Check

In [None]:
from sklearn import linear_model

# Fit one ridge regression (regularisation strength chosen by 5-fold
# cross-validation) per feature set.
ridge_fit_without_text = linear_model.RidgeCV(cv=5).fit(features_train, target_train)
ridge_fit_with_text = linear_model.RidgeCV(cv=5).fit(features_train_with_text, target_train)
ridge_fit_with_text_without_numerical = linear_model.RidgeCV(cv=5).fit(
    features_train_with_text_without_numerical, target_train)

# Evaluate each model on its matching held-out test matrix.
ridge_test_score_no_text = ridge_fit_without_text.score(features_test, target_test)
ridge_test_score_with_text = ridge_fit_with_text.score(features_test_with_text, target_test)
ridge_test_score_with_text_without_numerical = ridge_fit_with_text_without_numerical.score(
    features_test_with_text_without_numerical, target_test)

for _label, _score in (
    ("Test score for Regression WITHOUT text features:", ridge_test_score_no_text),
    ("Test score for Regression WITH text features:", ridge_test_score_with_text),
    ("Test score for Regression WITH text features and WITHOUT numerical features:",
     ridge_test_score_with_text_without_numerical),
):
    print(_label, _score)

Test score for Regression WITHOUT text features: 0.8328140839043826
Test score for Regression WITH text features: 0.8323800926500413
Test score for Regression WITH text features and WITHOUT numerical features: -0.5188041471387692


# Instructions for inputting data to make a prediction:

When this block of code is run, the user will be asked the following questions one at a time:


*   What will be the duration of the post (in seconds)? If the post is an image, enter 0.
*   Which platform will the post be on?
*   What type of post will it be? IG reel or IG image?
*   What hour of the day will the post be published (0-23)?
*   What month will the post be published (1-12)?
*   What day of the week will the post be published (0 for Monday, 6 for Sunday)?
*   Please provide a caption / description for the post:

After an answer is typed, press the enter key on your keyboard to move on to the next question. At the end, you should be provided an estimate of how many likes your post may receive. You will also be asked if you want to make another prediction. For this question, either type in yes or no, then press enter. If you type in no but want to make another prediction, you will have to run this block of code again or press the Run all button again.

## Note:
*   For the platform question, the only platform the model currently takes is **Instagram**.
*   For the type of post question there are two types the model takes:
 * IG reel  
 * IG image
*   For the hour posted question, the model uses a 24 hour scale. For example:
 * 0 = 12 am
 * 23 = 11 pm
*   For the month posted question, you type in the number that corresponds to the month you want to post. For example:
 * 1 = January
 * 12 = December
*   For the day of the week question, you will have to type in the number that corresponds to the day of the week you want to post.
 * 0 = Monday
 * 1 = Tuesday
 * 2 = Wednesday
 * 3 = Thursday
 * 4 = Friday
 * 5 = Saturday
 * 6 = Sunday

* When entering a description/caption, you should be able to type in what you want. If you run into any issues with your caption being too long, please let me know.
## Sample User input:

*   What will be the duration of the post (in seconds)? If the post is an image, enter 0. **0**
*   Which platform will the post be on? **Instagram**
*   What type of post will it be? IG reel or IG image? **IG reel**
*   What hour of the day will the post be published (0-23)? **18**
*   What month will the post be published (1-12)? **7**
*   What day of the week will the post be published (0 for Monday, 6 for Sunday)? **2**
*   Please provide a caption / description for the post: **Next week will be villains!!! #movies #movie #superheroes #fyp #viral #trendy #cooledtured #marvelstudios #humor #funny #comedy**

## Sample Output:
*   Estimated Likes: **9.18**

*   Rounded Likes:  **9.00**

*   Do you want to make another prediction? (yes/no): **no**






In [None]:
import scipy
import numpy as np

def _prompt_for_int(prompt, low, high):
    """Ask ``prompt`` repeatedly until the reply is a whole number in ``[low, high]``."""
    while True:
        reply = input(prompt)
        try:
            value = int(reply)
        except ValueError:
            print(f"Invalid input. Please enter a whole number from {low} to {high}.")
            continue
        if low <= value <= high:
            return value
        print(f"Invalid input. Please enter a whole number from {low} to {high}.")


def predict_likes(model, encoder, tfidf, numerical_feature_names_to_collect, all_numerical_feature_names, categorical_feature_names, numerical_means):
    """Interactively collect one post's details and return the model's prediction.

    Args:
        model: Fitted estimator; its ``predict`` is called on a single-row sparse
            matrix laid out as [numerical | one-hot categorical | TF-IDF text].
        encoder: Fitted encoder; its ``transform`` receives the categorical
            columns as a one-row DataFrame.
        tfidf: Fitted text vectorizer; its ``transform`` receives the caption.
        numerical_feature_names_to_collect: Numerical features to ask the user for.
        all_numerical_feature_names: Every numerical feature the model expects,
            in the column order used for training.
        categorical_feature_names: Categorical features to ask the user for, in
            the column order the encoder was fitted with.
        numerical_means: Mapping of feature name to the value used for any
            numerical feature that is not collected from the user.

    Returns:
        The first (only) element of ``model.predict`` for the entered post.
    """
    # Question wording per feature; anything not listed gets a generic question.
    numerical_prompts = {
        "Duration (sec)": "What will be the duration of the post (in seconds)? If the post is an image, enter 0. ",
    }
    categorical_prompts = {
        "Platform": "Which platform will the post be on? ",
        "Post type": "What type of post will it be? IG reel or IG image? ",
        "Hour": "What hour of the day will the post be published (0-23)? ",
        "Month": "What month will the post be published (1-12)? ",
        "Day_of_Week": "What day of the week will the post be published (0 for Monday, 6 for Sunday)? ",
    }
    # These features are derived from datetimes in the dataset, so they are
    # numeric there. Answers must be converted to integers: a raw string such
    # as "18" is not equal to the numeric category 18, so an encoder that
    # ignores unknown categories would silently encode it as all zeros.
    integer_ranges = {"Hour": (0, 23), "Month": (1, 12), "Day_of_Week": (0, 6)}

    user_input = {}
    print("Please answer the following questions:")

    # Numerical answers: keep asking until the reply parses as a number.
    for feature in numerical_feature_names_to_collect:
        prompt = numerical_prompts.get(feature, f"What is the value for {feature}? ")
        while True:
            try:
                user_input[feature] = float(input(prompt))
                break
            except ValueError:
                print("Invalid input. Please enter a numerical value.")

    # Categorical answers: integer-coded features are parsed and range-checked,
    # free-text ones (platform, post type, ...) are passed through unchanged.
    for feature in categorical_feature_names:
        prompt = categorical_prompts.get(feature, f"What is the value for {feature}? ")
        if feature in integer_ranges:
            low, high = integer_ranges[feature]
            user_input[feature] = _prompt_for_int(prompt, low, high)
        else:
            user_input[feature] = input(prompt)

    # Caption used for the text features
    user_input['Description'] = input("Please provide a caption / description for the post: ")

    # One-row DataFrame; numerical features the user was not asked about are
    # filled in from `numerical_means`.
    user_df = pd.DataFrame([user_input])
    for feature in all_numerical_feature_names:
        if feature not in user_df.columns:
            user_df[feature] = numerical_means[feature]

    # Encode the categorical and text inputs with the already-fitted transformers.
    user_categorical_features = encoder.transform(user_df[categorical_feature_names])
    user_tfidf_matrix = tfidf.transform(user_df['Description'])

    # Assemble the row in the same column order as the training matrix:
    # numerical, then one-hot, then TF-IDF.
    user_numerical_features_sparse = scipy.sparse.csr_matrix(user_df[all_numerical_feature_names].values)
    user_features = scipy.sparse.hstack((user_numerical_features_sparse, user_categorical_features), format='csr')
    user_features_with_text = scipy.sparse.hstack((user_features, user_tfidf_matrix), format='csr')

    predicted_likes = model.predict(user_features_with_text)

    return predicted_likes[0]  # single-row input, so a single prediction

# Define the lists of feature names used during training and the ones to collect from the user
all_numerical_feature_names = ["Duration (sec)","Reach", "Shares", "Plays", "Comments", "Saves"] # Corrected to match the features used in training
numerical_feature_names_to_collect = ["Duration (sec)"] # Only collect Duration (sec) from user
#categorical_feature_names_for_prediction = ["Platform", "Post type", "Genres", "Content_type", "Hour", "Month", "Day_of_Week"] # with genre and content type
categorical_feature_names_for_prediction = ["Platform", "Post type","Hour", "Month", "Day_of_Week"] # without genre and content type

# Calculate the mean of numerical features from the training data
numerical_means = df[all_numerical_feature_names].mean().to_dict()


while True:
    estimated_likes = predict_likes(
        ridge_fit_with_text,
        encoder,
        tfidf,
        numerical_feature_names_to_collect,
        all_numerical_feature_names,
        categorical_feature_names_for_prediction,
        numerical_means
    )
    print(f"\nEstimated Likes: {estimated_likes:.2f}")
    print(f"\nRounded Likes:  {round(estimated_likes):.2f}")

    another_prediction = input(f"\nDo you want to make another prediction? (yes/no): ").lower()
    if another_prediction != 'yes':
        break

Please answer the following questions:
What will be the duration of the post (in seconds)? If the post is an image, enter 0. 0
Which platform will the post be on? Instagram
What type of post will it be? IG reel or IG image? IG reel
What hour of the day will the post be published (0-23)? 0
What month will the post be published (1-12)? 1
What day of the week will the post be published (0 for Monday, 6 for Sunday)? 2
Please provide a caption / description for the post: "Who would you pick to invite in New Year's Eve? Are you Team Tifa or Team Aerith?  Happy New Year 2022 from all of us here at cooledtured! We hope you had an amazing 2021! We sure did, thanks to all of your support over the past year. Here’s to an equally great 2022!  #FinalFantasy7 #FF7 #Tifa #Aerith #NewYear2022 #HappyNewYear #toyshop #actionfigure #toysofinstagram #toysnearme"

Estimated Likes: 13.17

Rounded Likes:  13.00

Do you want to make another prediction? (yes/no): no
