# Ad Videos Sentiment Analysis: Logistic Regression, TF-IDF, and Predictive Analytics.
## Mohit Ravindra Kamble

## Northeastern University
## July 15, 2024

In [None]:
import pandas as pd
import re
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
# Read the ad sample and the ground-truth form responses from disk.
data = pd.read_csv("/content/Sample.csv")
ground_data = pd.read_csv("/content/ground-truth.xlsx - Form Responses 1.csv")

# Left-join the form responses onto the selected ad columns by ad id.
ad_columns = ["creative_data_id", "creative_data_title", "creative_data_description", "speech"]
merged_data = data[ad_columns].merge(ground_data, how="left", on=["creative_data_id"])
# reset_index() turns the row index into an 'index' column.
merged_data = merged_data.reset_index()
# Columns removed in place: bookkeeping fields and free-text / follow-up
# survey items listed below.
columns_to_drop = [
    'If "yes" to the above, which of the following emotions is closest to the emotion that the ad was intending the viewer to feel? (Select all that apply.)',
    'Timestamp',
    'index',
    'creative_data_title',
    'Please enter the video identifier one more time (e.g. 123456789.mp4)',
    'Any additional feedback or things we should be aware of? ',
    'After addressing the specific survey items, write a general description of the ad. You can use answers to the questions above to formulate your answer. Your description should include:\nBrand and Product Identification: \nSpecify the brand and whether a product is being advertised. (1 sentence)\nVisual Elements: Describe what is seen on the screen, including setting, characters, and any text or graphics. (max 2 sentences)\nAuditory Elements: Note what is heard, such as dialogue, voice-over, music, or sound effects. (max 2 sentences)\n',
    'If yes to the above, did the ad successfully affect you emotionally, as intended?',
    'If yes to the above, did the ad successfully affect you emotionally, as intended?',
    'If yes to the above, was the ad successfully funny, as intended?',
    'Was there a famous person in this ad? ',
    'If yes to the above, write the name of the famous person, if known.',
    'What happened in this ad? (Answer in 2-3 sentences each)',
    'What was/were the company\'s goal(s) with this ad? Choose (potentially multiple) from:',
    'How successful was the ad in achieving its goal(s)?',
    'How much did you like the ad? (1. Strongly dislike, 2. Dislike, 3. Neither Like or Dislike, 4. Like, 5. Strongly Like)',
    'What was the slogan presented in the ad, if any?',
]
merged_data.drop(columns=columns_to_drop, inplace=True)

# Encode the survey answers numerically for every column from position 3
# onward: "Yes" is replaced by 1 first, then "No" by 0.
answer_codes = ((r'(?:Yes)', 1), (r'(?:No)', 0))
for question in merged_data.columns[3:]:
    for pattern, code in answer_codes:
        merged_data[question] = merged_data[question].replace(to_replace=pattern, value=code, regex=True)

# Group by merged_data with mode value of questions
def _most_common_answer(answers):
    """Return the most frequent non-missing answer in *answers*, or NaN if there is none."""
    modes = answers.mode()
    # mode() ignores NaN, so an ad that received no survey responses (possible
    # because the merge is a left join) yields an empty result; indexing it
    # with [0] would raise instead of producing a value.
    return modes.iloc[0] if not modes.empty else float('nan')


merged_data = (
    merged_data
    .groupby(by=['creative_data_id', 'creative_data_description', 'speech'])
    .agg(_most_common_answer)
    # Ads lacking an answer for any question cannot be used as training labels.
    .dropna()
    .reset_index()
)

In [None]:
# Display the column labels of merged_data.
merged_data.columns

Index(['creative_data_id', 'creative_data_description', 'speech',
       'Is there a call to go online (e.g., shop online, visit the Web)? ',
       'Is there online contact information provided (e.g., URL, website)? ',
       'Is there a visual or verbal call to purchase (e.g., buy now, order now)?',
       'Does the ad portray a sense of urgency to act (e.g., buy before sales ends, order before ends)? ',
       'Is there an incentive to buy (e.g., a discount, a coupon, a sale or "limited time offer")? ',
       'Is there offline contact information provided (e.g., phone, mail, store location)?',
       'Is there mention of something free? ',
       'Does the ad mention at least one specific product or service (e.g., model, type, item)? ',
       'Is there any verbal or visual mention of the price?',
       'Does the ad show the brand (logo, brand name) or trademark (something that most people know is the brand) multiple times?\n\nFor example, Nike ads often have the "swoosh" logo pro

In [None]:
# Preview the first rows of merged_data.
merged_data.head()

Unnamed: 0,creative_data_id,creative_data_description,speech,"Is there a call to go online (e.g., shop online, visit the Web)?","Is there online contact information provided (e.g., URL, website)?","Is there a visual or verbal call to purchase (e.g., buy now, order now)?","Does the ad portray a sense of urgency to act (e.g., buy before sales ends, order before ends)?","Is there an incentive to buy (e.g., a discount, a coupon, a sale or ""limited time offer"")?","Is there offline contact information provided (e.g., phone, mail, store location)?",Is there mention of something free?,...,"Is the ad intended to affect the viewer emotionally, either with positive emotion (fun, joy), negative emotion (sad, anxious) or another type of emotion? (Note: You may not personally agree, but assess if that was the intention.)",Does the ad give you a positive feeling about the brand?,"Does the ad have a story arc, with a beginning and an end?","Does the ad have a reversal of fortune, where something changes for the better, or changes for the worse?",Does the ad have relatable characters?,Is the ad creative/clever?,"Is the ad intended to be funny? (Note: You may not personally agree, but assess if that was the intention.)","Does this ad provide sensory stimulation (e.g., cool visuals, arousing music, mouth-watering)?",Is the ad visually pleasing?,"Does the ad have cute elements like animals, babies, animated, characters, etc?"
0,1471363,"The new MINI Countryman is the largest yet, pr...",It's another pure gray morning. Don't know wha...,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
1,1488315,What would you do if the end of the world was ...,The end of civilization is upon us. Hold your ...,0,0,0,0,0,0,0,...,1,1,1,0,0,1,0,1,0,0
2,1526213,As a man speeds down a country road in his Aud...,Audi presens can help prepare for and in some ...,0,0,0,0,1,1,0,...,0,1,0,0,0,1,0,1,1,0
3,1548815,"On an otherwise peaceful day, two giant monste...",The new Honda Odyssey has tons of available sm...,0,0,0,0,0,0,0,...,1,1,1,0,1,1,1,0,1,1
4,1624211,Chevy's spokesperson lists off all the feature...,Hi guys. So this is the all new Chevy Equinox....,0,0,0,0,0,0,0,...,1,1,0,0,0,1,0,0,0,0


In [None]:
# Helpers built once and reused by every clean_text call.
_URL_PATTERN = re.compile(r'http\S+|www\S+', flags=re.IGNORECASE)
_DIGIT_PATTERN = re.compile(r'\d+')
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


# Function to clean text data
def clean_text(text):
    """Normalise a piece of text before vectorisation.

    Steps, in order: strip URLs (anything starting with ``http`` or ``www``,
    in any letter case), delete ASCII punctuation, delete digits, lower-case.

    Args:
        text: The text to clean. ``None`` and float NaN (how pandas represents
            a missing cell) are treated as empty text; any other non-string
            value is converted with ``str()`` first.

    Returns:
        The cleaned, lower-cased string (``''`` for missing input).
    """
    # `text != text` is only true for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text)

    # URLs must go first: once punctuation is stripped they are no longer
    # recognisable as URLs.
    text = _URL_PATTERN.sub('', text)
    text = text.translate(_PUNCTUATION_TABLE)
    text = _DIGIT_PATTERN.sub('', text)
    return text.lower()

In [None]:
# Apply the cleaning function to the speech column.
# Missing transcripts (NaN) are replaced by empty text first so the cleaner
# only ever receives strings.
data['cleaned_speech'] = data['speech'].fillna('').apply(clean_text)

In [None]:
# Initialize the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')

# Fit and transform the cleaned speech taken from merged_data, not from the raw
# `data` frame: the labels are read from merged_data, which has been grouped
# (deduplicated and re-ordered), so row i of X must be the transcript of row i
# of merged_data for features and labels to line up.
X = tfidf_vectorizer.fit_transform(merged_data['speech'].fillna('').apply(clean_text))

In [None]:
# Dictionary to store results: question text -> dict of evaluation metrics
metrics = {}

# The three identifier columns are copied to the output as-is; every column
# from position 3 onward is treated as one question to model.
identifier_columns = ['creative_data_id', 'creative_data_description', 'speech']
question_columns = list(merged_data.columns[3:])

# Fail early, with a readable message, if features and labels cannot be paired
# row by row.
if X.shape[0] != len(merged_data):
    raise ValueError(
        f"Feature matrix has {X.shape[0]} rows but merged_data has "
        f"{len(merged_data)}; X must be built from the same rows as the labels."
    )

# Result dataframe: identifier columns now, one predicted-answer column per
# question added inside the loop.
result_df = merged_data[identifier_columns].copy()

# Train and evaluate a model for each question.
# NOTE(review): each model is scored on the same rows it was fitted on, so the
# metrics below are in-sample figures, not an estimate of held-out performance.
for question in question_columns:
    y = merged_data[question]

    if y.nunique() < 2:
        # LogisticRegression refuses to fit when only one class is present;
        # with a single observed answer the only possible prediction is that
        # answer, so use it instead of aborting the whole loop.
        y_pred = y.to_numpy()
    else:
        # Logistic regression model
        model = LogisticRegression(max_iter=1000)
        model.fit(X, y)
        y_pred = model.predict(X)

    # Calculate evaluation metrics (weighted by class support); zero_division=0
    # is passed to all three so an undefined score is reported as 0.
    accuracy = accuracy_score(y, y_pred)
    precision = precision_score(y, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y, y_pred, average='weighted', zero_division=0)

    # Store the metrics
    metrics[question] = {
        'agreement_percentage': accuracy,
        'precision': precision,
        'f1_score': f1,
        'recall': recall
    }

    # Convert the 1/0 predictions back to the survey's Yes/No wording.
    result_df[question] = ['Yes' if x == 1 else 'No' for x in y_pred]

In [None]:
# Optional verbose report (left disabled): prints every metric per question.
# for number, (question_text, scores) in enumerate(metrics.items(), start=1):
#     print(f"Metrics for question {number} --> '{question_text}':")
#     print(f"  Agreement Percentage: {scores['agreement_percentage']}")
#     print(f"  Precision: {scores['precision']}")
#     print(f"  Recall: {scores['recall']}")
#     print(f"  F1 Score: {scores['f1_score']}")
#     print("\n")

# Summary table: one row per question, one column per metric, two decimals.
pd.DataFrame(metrics).T.round(2)

Unnamed: 0,agreement_percentage,precision,f1_score,recall
"Is there a call to go online (e.g., shop online, visit the Web)?",0.74,0.55,0.63,0.74
"Is there online contact information provided (e.g., URL, website)?",0.97,0.97,0.97,0.97
"Is there a visual or verbal call to purchase (e.g., buy now, order now)?",0.69,0.79,0.61,0.69
"Does the ad portray a sense of urgency to act (e.g., buy before sales ends, order before ends)?",0.75,0.57,0.65,0.75
"Is there an incentive to buy (e.g., a discount, a coupon, a sale or ""limited time offer"")?",0.91,0.92,0.91,0.91
"Is there offline contact information provided (e.g., phone, mail, store location)?",0.83,0.69,0.76,0.83
Is there mention of something free?,0.93,0.86,0.89,0.93
"Does the ad mention at least one specific product or service (e.g., model, type, item)?",0.84,0.71,0.77,0.84
Is there any verbal or visual mention of the price?,0.71,0.8,0.64,0.71
"Does the ad show the brand (logo, brand name) or trademark (something that most people know is the brand) multiple times?\n\nFor example, Nike ads often have the ""swoosh"" logo prominently displayed on shoes and apparel worn by celebrity athletes. The ""Just Do It"" slogan is another Nike trademark frequently included.",0.85,0.72,0.78,0.85


In [None]:
# The exported file keeps the ad id and the predicted answers only, so the two
# text columns are removed in place first.
text_columns = ['creative_data_description', 'speech']
result_df.drop(text_columns, axis=1, inplace=True)

# Write the predictions to CSV without the row index.
results_path = "/content/results.csv"
result_df.to_csv(results_path, index=False)

In [None]:
# IPython shell escape: write the output of `pip freeze` to requirements.txt.
!pip freeze > "/content/requirements.txt"