# Data Preprocessing

## Importing Libraries and Data Collection

In [1]:
import numpy as np # For working with arrays
import pandas as pd # For working with datasets
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt # For data visualization
import seaborn as sns # For statistical graphing
import re # For regular expressions
#from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer # Determines if text is a neutral, negative, or positive sentiment
from sklearn.model_selection import train_test_split # For splitting the data for regression analysis
from sklearn.preprocessing import StandardScaler # For scaling the data for regression analysis
from sklearn.linear_model import LinearRegression # Regression Analysis
from sklearn.metrics import mean_squared_error, r2_score # Further Analysis

In [2]:
# Load the four raw datasets from the working directory, in the order
# box office, critic reviews, Rotten Tomatoes movies, IMDB movies.
boxoffice, tomato_critic, tomato_movies, imdb_movies = (
    pd.read_csv(csv_path)
    for csv_path in (
        'boxoffice2014_2023.csv',
        'rotten_tomatoes_critic_reviews.csv',
        'rotten_tomatoes_movies.csv',
        'IMDB_Movies.csv',
    )
)

FileNotFoundError: [Errno 2] No such file or directory: 'boxoffice2014_2023.csv'

In [None]:
# Mount Google Drive under /content/drive (only available inside Google Colab).
# NOTE(review): the CSV-loading cell above failed with FileNotFoundError; if the
# CSVs live on Drive, this cell presumably has to run first and the read paths
# need the /content/drive prefix — confirm.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
boxoffice.head(5)

In [None]:
tomato_critic.head(5)

In [None]:
tomato_movies.head(5)

In [None]:
imdb_movies.head(5)

## Data Cleaning and Handling

### Checking for Data Anomalies

#### Box Office Summary

In [None]:
boxoffice.info()

In [None]:
boxoffice.describe()

#### Rotten Tomato Critic Review Summary

In [None]:
tomato_critic.info()

In [None]:
tomato_critic.describe()

#### Rotten Tomato Movies Summary

In [None]:
tomato_movies.info()

In [None]:
tomato_movies.describe()

#### IMDB Movies Summary

In [None]:
imdb_movies.info()

In [None]:
imdb_movies.describe()

### Data Cleaning

#### Finding and Handling Missing Values

In [None]:
def FindingNulls(data):
  """Print the per-column null counts of `data`, then every row holding a null.

  Nothing is returned and `data` is not modified.
  """
  null_mask = data.isnull()
  print("Count of Initial Missing Values")
  print(null_mask.sum())
  # A row is reported as soon as any one of its cells is null.
  rows_with_nulls = data.loc[null_mask.any(axis = 1)]
  print("\nRows with missing values:\n")
  print(rows_with_nulls)

In [None]:
# Drop rows with missing categorical values, then fill missing numerical values
def HandlingNulls(data, interpolate_method = 'linear'):
  """Return a copy of `data` with no nulls in categorical or numerical columns.

  Rows with a null in any object/category column are dropped. Nulls in numeric
  columns are interpolated with `interpolate_method`; whatever interpolation
  leaves behind (e.g. leading gaps) is forward-filled and then back-filled.

  Args:
    data: input DataFrame; it is not modified.
    interpolate_method: `method` passed to `Series.interpolate`.

  Returns:
    A new DataFrame containing the surviving rows with the gaps filled.
  """
  categorical_col = data.select_dtypes(include = ['object', 'category']).columns
  numerical_col = data.select_dtypes(include = [np.number]).columns

  # .copy() makes the result independent of `data`, so the column assignments
  # below never write into (or warn about) a view of the caller's frame.
  nonull_data = data.dropna(subset = categorical_col).copy()

  for column in numerical_col:
    if nonull_data[column].isnull().any():
      nonull_data[column] = nonull_data[column].interpolate(method = interpolate_method)
  if len(numerical_col):
    # ffill()/bfill() replace the deprecated fillna(method=...) spelling.
    nonull_data[numerical_col] = nonull_data[numerical_col].ffill().bfill()

  return nonull_data

##### Box Office Nulls

In [None]:
FindingNulls(boxoffice)

In [None]:
boxoffice = HandlingNulls(boxoffice)

In [None]:
boxoffice.isnull().sum()

In [None]:
boxoffice

##### Tomato Critic Review Nulls

In [None]:
FindingNulls(tomato_critic)

In [None]:
tomato_critic = HandlingNulls(tomato_critic)

In [None]:
tomato_critic.isnull().sum()

In [None]:
tomato_critic

##### Tomato Movie Reviews Nulls

In [None]:
FindingNulls(tomato_movies)

In [None]:
tomato_movies = HandlingNulls(tomato_movies)

In [None]:
tomato_movies.isnull().sum()

In [None]:
tomato_movies

##### IMDB Movies Nulls

In [None]:
FindingNulls(imdb_movies)

In [None]:
imdb_movies = HandlingNulls(imdb_movies)

In [None]:
imdb_movies.isnull().sum()

In [None]:
imdb_movies

#### Finding and Removing Duplicates

In [None]:
def FindDuplicates(data):
  """Print every row of `data` that is an exact duplicate of another row.

  All members of each duplicate group are shown (`keep = False`), not just the
  repeats. Nothing is returned and `data` is not modified.
  """
  # The former to_csv(index = False) call only built a CSV string that was
  # thrown away, so it has been removed.
  data_duplicates = data[data.duplicated(keep = False)]
  print("Duplicate Rows:\n")
  print(data_duplicates)

In [None]:
def RemoveDuplicates(data, subset = None, keep = 'first'):
  """Return `data` without duplicate rows and print how many were removed.

  Args:
    data: input DataFrame; it is not modified.
    subset: columns used to identify duplicates (None compares all columns).
    keep: which occurrence to keep, as in `DataFrame.drop_duplicates`.

  Returns:
    A new DataFrame with the duplicates dropped.
  """
  data_cleaned = data.drop_duplicates(subset = subset, keep = keep)
  removed_duplicates = len(data) - len(data_cleaned)
  print("Removed the duplicates from the dataset", removed_duplicates)
  return data_cleaned

##### Box Office Duplicates

In [None]:
FindDuplicates(boxoffice)

In [None]:
boxoffice = RemoveDuplicates(boxoffice)

In [None]:
FindDuplicates(boxoffice)

In [None]:
boxoffice

##### Tomato Critic Reviews Duplicates

In [None]:
FindDuplicates(tomato_critic)

In [None]:
tomato_critic = RemoveDuplicates(tomato_critic)

In [None]:
FindDuplicates(tomato_critic)

In [None]:
tomato_critic

##### Tomato Movies Duplicates

In [None]:
FindDuplicates(tomato_movies)

In [None]:
tomato_movies

##### IMDB Movies Duplicates

In [None]:
FindDuplicates(imdb_movies)

In [None]:
imdb_movies = RemoveDuplicates(imdb_movies)

In [None]:
FindDuplicates(imdb_movies)

In [None]:
imdb_movies

#### Cleaning Textual Data

In [None]:
def CleanTextualData(data):
  """Normalize every object/string column of `data` in place and return it.

  String cells are lower-cased, stripped of every character that is not an
  ASCII letter, digit or whitespace, and have whitespace runs collapsed to a
  single space with the ends trimmed. Non-string cells are left as they are.
  """
  non_alnum = re.compile(r'[^a-zA-Z0-9\s]')
  whitespace_run = re.compile(r'\s+')

  def CleanText(text):
    if not isinstance(text, str):
      return text
    without_symbols = non_alnum.sub('', text.lower())
    return whitespace_run.sub(' ', without_symbols).strip()

  for column in data.select_dtypes(include = ['object', 'string']).columns:
    data[column] = data[column].apply(CleanText)
  return data

##### Box Office Text Cleaning

In [None]:
boxoffice_cleaned = CleanTextualData(boxoffice)
boxoffice_cleaned

##### Tomato Critic Reviews Text Cleaning

In [None]:
tomato_critic_cleaned = CleanTextualData(tomato_critic)
tomato_critic_cleaned

##### Tomato Movies Text Cleaning

In [None]:
tomato_movies_cleaned = CleanTextualData(tomato_movies)
tomato_movies_cleaned

##### IMDB Movies Text Cleaning

In [None]:
imdb_movies_cleaned = CleanTextualData(imdb_movies)
imdb_movies_cleaned

## Feature Engineering

### Finding Correlated Features

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
nltk.downloader.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.preprocessing import LabelEncoder

In [None]:
# VADER analyzer instance.
# NOTE(review): the keyword-based SentimentAnalysis function in this notebook
# does not reference `sia`; confirm whether it is still needed.
sia = SentimentIntensityAnalyzer()
# Keyword lists that SentimentAnalysis looks for (by substring) in lower-cased text.
positive_words = ['good', 'great', 'excellent', 'awesome', 'amazing', 'love', 'fantastic', 'positive']
negative_words = ['bad', 'poor', 'terrible', 'awful', 'hate', 'negative', 'worst', 'disappointing']

In [None]:
def SentimentAnalysis(text):
  """Score `text` as (#positive keywords present) - (#negative keywords present).

  Each keyword counts at most once and is matched by substring containment on
  the lower-cased text, so 'bad' also matches inside 'badge'. Non-string input
  scores 0 (neutral).
  """
  if isinstance(text, str):
    lowered = text.lower()
    positives_found = [word for word in positive_words if word in lowered]
    negatives_found = [word for word in negative_words if word in lowered]
    return len(positives_found) - len(negatives_found)
  return 0 # Neutral sentiment for non-string inputs

def AddTextFeatures(df, text_col):
  """Add 'sentiment', 'text_length' and 'word_count' columns derived from `text_col`.

  The columns are written into `df` in place and `df` is returned. When
  `text_col` is missing, a message is printed and `df` is returned untouched.
  """
  if text_col not in df.columns:
    print(f"Column '{text_col}' not found in the DataFrame.")
    return df

  def char_count(value):
    return len(str(value))

  def token_count(value):
    return len(str(value).split())

  df['sentiment'] = df[text_col].apply(SentimentAnalysis)
  df['text_length'] = df[text_col].apply(char_count)
  df['word_count'] = df[text_col].apply(token_count)
  return df

def AddTfidfFeatures(df, text_col, max_features = 12):
  """Append TF-IDF columns `tfidf_0 .. tfidf_{k-1}` computed from `text_col`.

  Args:
    df: input DataFrame.
    text_col: column whose values (cast to str) are vectorized.
    max_features: upper bound on the number of TF-IDF terms kept.

  Returns:
    A new DataFrame with a fresh 0..n-1 index and the TF-IDF columns appended,
    or `df` itself when `text_col` is not a column.
  """
  if text_col not in df.columns:
    return df
  vectorizer = TfidfVectorizer(max_features = max_features)
  tfidf_matrix = vectorizer.fit_transform(df[text_col].astype(str))
  # The vocabulary can be smaller than max_features, so the column names are
  # sized from the matrix actually produced; naming max_features columns would
  # raise a shape mismatch on small vocabularies.
  feature_count = tfidf_matrix.shape[1]
  tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns = [f'tfidf_{i}' for i in range(feature_count)])
  return pd.concat([df.reset_index(drop = True), tfidf_df], axis = 1)

def EncodeCategorical(df):
  """Replace every object-dtype column of `df` with integer label codes, in place.

  A single LabelEncoder is refitted per column, so codes are independent
  between columns. Returns `df`.
  """
  encoder = LabelEncoder()
  object_columns = df.select_dtypes(include = ['object']).columns
  for name in object_columns:
    codes = encoder.fit_transform(df[name])
    df[name] = codes
  return df

def AddDateFeatures(df, date_col):
  """Split `date_col` into 'year', 'month' and 'day' columns, in place.

  The column is parsed with `pd.to_datetime` (unparseable values become NaT),
  the three parts are added, and the original column is dropped. When
  `date_col` is not a column, `df` is returned unchanged.
  """
  if date_col not in df.columns:
    return df
  df[date_col] = pd.to_datetime(df[date_col], errors = "coerce")
  for part in ('year', 'month', 'day'):
    df[part] = getattr(df[date_col].dt, part)
  df.drop(columns = [date_col], inplace = True)
  return df

def StandardizeNumerical(df):
  """Standardize every numeric column of `df` in place with StandardScaler.

  Returns `df`.
  """
  numeric_names = df.select_dtypes(include = [np.number]).columns
  scaled_values = StandardScaler().fit_transform(df[numeric_names])
  df[numeric_names] = scaled_values
  return df

In [None]:
def ProcessedDataset(df, text_col = None, date_col = None, max_tfidf_features = 12):
  """Run the feature-engineering pipeline and return the processed frame.

  Steps, in order: text features and TF-IDF columns (when `text_col` is given),
  date parts (when `date_col` is given), label encoding of the remaining
  object columns (when `text_col` is given), then standardization of all
  numeric columns.

  Args:
    df: input DataFrame.
    text_col: name of the free-text column, or None to skip text processing.
    date_col: name of the date column, or None to skip date processing.
    max_tfidf_features: upper bound on the number of TF-IDF columns.

  Returns:
    The processed DataFrame.
  """
  if text_col:
    df = AddTextFeatures(df, text_col)
    df = AddTfidfFeatures(df, text_col, max_features = max_tfidf_features)
  # Dates must be parsed BEFORE label encoding: once the date column has been
  # turned into integer codes, pd.to_datetime reads them as epoch offsets and
  # year/month/day carry no information.
  if date_col:
    df = AddDateFeatures(df, date_col)
  if text_col:
    df = EncodeCategorical(df)
  df = StandardizeNumerical(df)
  return df

In [None]:
# Build the feature-engineered frame for each review dataset.
# NOTE(review): the text/date column names are assumed to exist in each CSV;
# a missing text column only prints a message and a missing date column is
# skipped silently — confirm 'review_text' / 'release_date' against the files.
tomato_critic_feat_eng = ProcessedDataset(tomato_critic_cleaned, text_col = 'review_content', date_col = 'review_date')
tomato_movies_feat_eng = ProcessedDataset(tomato_movies_cleaned, text_col = 'critic_consensus', date_col = 'release_date')
imdb_movies_feat_eng = ProcessedDataset(imdb_movies_cleaned, text_col = 'review_text', date_col = 'release_date')

In [None]:
tomato_critic_feat_eng

In [None]:
tomato_movies_feat_eng

In [None]:
imdb_movies_feat_eng

In [None]:
def CorrelationMatrix(data, data_name = "Dataset", font_size = 10):
  """Plot a lower-triangle correlation heatmap for `data` and return the matrix.

  Args:
    data: DataFrame whose pairwise column correlations are computed.
    data_name: label used in the printed header and the plot title.
    font_size: size of the annotations and tick labels (title uses +4).

  Returns:
    The full correlation matrix from `data.corr()`.
  """
  corr_matrix = data.corr()
  # True on and above the diagonal, so only the lower triangle is drawn.
  hidden_cells = np.triu(np.ones_like(corr_matrix, dtype = bool))

  print(f"Correlation Matrix for {data_name}")
  print("\n")

  plt.figure(figsize = (12, 10))
  heatmap_options = {
    'mask': hidden_cells,
    'annot': True,
    'cmap': 'coolwarm',
    'fmt': '.2f',
    'cbar': True,
    'annot_kws': {"size": font_size},
  }
  sns.heatmap(corr_matrix, **heatmap_options)
  plt.title(f"Correlation Matrix Heatmap: {data_name}", fontsize = font_size + 4)
  plt.xticks(fontsize = font_size)
  plt.yticks(fontsize = font_size)
  plt.show()

  return corr_matrix


In [None]:
CorrelationMatrix(tomato_critic_feat_eng, "Tomato Critic Reviews")

In [None]:
CorrelationMatrix(tomato_movies_feat_eng, "Tomato Movies")

In [None]:
CorrelationMatrix(imdb_movies_feat_eng, "IMDB Movies", font_size = 8)

# Model Building



## **Sentiment Analysis**

> Add blockquote



In [None]:
# Install Hugging Face Transformers
# (`!` is an IPython/Colab shell escape; this line is not valid in a plain .py script.)
!pip install transformers

# Import pipeline for sentiment analysis
from transformers import pipeline

# Load the emotion classification model
# NOTE(review): `return_all_scores` is reported as deprecated in recent
# transformers releases in favour of `top_k=None`; the output nesting may
# differ between the two, so verify before switching.
classifier = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base", return_all_scores=True)

# Test the model
print(classifier("This movie is fantastic!"))


In [None]:
# Function to classify emotions in a dataset
def classify_reviews(data, text_column):
    """Add an 'emotion_scores' column holding the classifier output per row.

    The module-level `classifier` is called once for every value of
    `text_column`. `data` is modified in place and returned.
    """
    per_row_scores = data[text_column].apply(classifier)
    data['emotion_scores'] = per_row_scores
    return data

# Apply to the critic reviews dataset.
# The review text lives in 'review_content' (the column the feature-engineering
# and n-gram cells read from this dataset); the previous 'review' name is not
# used anywhere else in the notebook.
# This generates emotion scores for each review and adds them to
# tomato_critic_cleaned in place.
tomato_critic_with_emotions = classify_reviews(tomato_critic_cleaned, 'review_content')

# Display the dataset with the new emotion scores
tomato_critic_with_emotions.head()


### Comparison Plots

In [None]:
import matplotlib.pyplot as plt

def year(x):
  """Return the first four items of `x`, i.e. the year of a 'YYYY...' date string."""
  leading_four = slice(None, 4)
  return x[leading_four]

def boxPlotByYear(data, name = '', years = range(2006, 2017)):
  """Draw one box per year for `data` and show the figure.

  Args:
    data: sequence of per-year value collections, one entry per box.
    name: plot title.
    years: tick labels, one per entry of `data`; defaults to 2006..2016,
      the range that used to be hard-coded.
  """
  fig, ax = plt.subplots(figsize =(15, 4))
  ax.boxplot(data)
  ax.set_xticklabels(years)
  plt.title(name)
  plt.show()

# Per-year collections fed to boxPlotByYear: the "R" lists hold Rotten Tomatoes
# series, the "I" lists hold IMDB series (one entry per year, 2006..2016).
runtimeDataR = []
audienceCountDataR = []
expertCountDataR = []
scoreDataR = []
runtimeDataI = []
audienceCountDataI = []
expertCountDataI = []
scoreDataI = []

for curYear in range(2006,2017):
  # Rows whose release date starts with the current year (string comparison on the first 4 characters).
  tomato_box = tomato_movies.loc[tomato_movies['original_release_date'].apply(year) == str(curYear), ['movie_title', 'original_release_date', 'runtime', 'tomatometer_rating', 'audience_rating','tomatometer_count','audience_count']]
  # IMDB rows are matched on the numeric title_year column instead.
  imbd_box = imdb_movies.loc[imdb_movies['title_year'] == curYear, ['movie_title', 'title_year', 'duration', 'num_critic_for_reviews','num_voted_users','imdb_score']]
  # Linear rescale of the summed ratings: a sum of 0 maps to 1 and a sum of 200 maps to 10
  # (assumes both ratings are on a 0-100 scale — TODO confirm).
  # NOTE(review): tomato_box is a .loc selection; assigning a column to it may
  # raise SettingWithCopyWarning — consider .copy() on the selection.
  tomato_box['overall_score'] = (tomato_box['tomatometer_rating'] + tomato_box['audience_rating']) * 9 / 200 + 1
  tomato_box.drop(columns = ['tomatometer_rating', 'audience_rating'], inplace = True)
  runtimeDataR.append(tomato_box['runtime'])
  audienceCountDataR.append(tomato_box['audience_count'])
  expertCountDataR.append(tomato_box['tomatometer_count'])
  scoreDataR.append(tomato_box['overall_score'])
  runtimeDataI.append(imbd_box['duration'])
  audienceCountDataI.append(imbd_box['num_voted_users'].apply(int))
  expertCountDataI.append(imbd_box['num_critic_for_reviews'])
  scoreDataI.append(imbd_box['imdb_score'])

# One figure per metric and source, each with a box per year.
boxPlotByYear(runtimeDataR, "Rotton Tomatoes Runtime Distribution")
boxPlotByYear(runtimeDataI, "IMDB Runtime Distribution")
boxPlotByYear(audienceCountDataR, "Number of audience reviews in Rotton Tomatoes")
boxPlotByYear(audienceCountDataI, "Number of audience reviews in IMDB")
boxPlotByYear(expertCountDataR, "Number of expert reviews in Rotton Tomatoes")
boxPlotByYear(expertCountDataI, "Number of expert reviews in IMDB")
boxPlotByYear(scoreDataR, "Rotton Tomatoes adjusted score")
boxPlotByYear(scoreDataI, "IMDB Score")


### Sentiment analysis comparison

In [None]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt_tab')
nltk.download('vader_lexicon')

In [None]:
import csv
import re
import string
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from textblob import TextBlob
from nltk.sentiment import SentimentIntensityAnalyzer
from nrclex import NRCLex

sentimentSets = []

# Loop-invariant NLP helpers, built once instead of once per year.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
ps = PorterStemmer()

# Tag every review with its year once (first four characters of review_date)
# rather than copying and re-parsing the whole dataset on every iteration.
TomatoCriticByYear = tomato_critic.copy()
TomatoCriticByYear['year'] = TomatoCriticByYear['review_date'].apply(year).apply(int)

for CurYear in range(2006,2017):
  # .copy() so the columns added below go into an independent frame, not a slice.
  TomatoCriticInit = TomatoCriticByYear.loc[TomatoCriticByYear['year'] == CurYear].copy()
  if TomatoCriticInit.empty:
    # CountVectorizer raises on an empty vocabulary, so skip years without reviews.
    print(f"{CurYear} Dictionary: no reviews")
    continue
  #print(ElonInit.head())
  #TomatoCriticInit['text'] = TomatoCriticInit['review_content'].str.replace(r"^\"?b['\"]\"?\s+|\s+['\"]\"?$|^\"?b['\"]\"?|'$|^\s+|\s+$", '', regex=True)
  #TomatoCriticInit['text'] = TomatoCriticInit['text'].str.replace(r'([\s"\'])@\w+', '', regex=True)
  #TomatoCriticInit['text'] = TomatoCriticInit['text'].str.replace(r'^@\w+\s', '', regex=True)
  # Strip escaped UTF-8 byte sequences, punctuation and URLs from the review text.
  TomatoCriticInit['text'] = TomatoCriticInit['review_content'].str.replace(r'\\xe2\\x..\\x..\s|\s?\\xe2\\x..\\x..', '', regex=True)
  TomatoCriticInit['text'] = TomatoCriticInit['text'].str.translate(str.maketrans('', '', string.punctuation))
  TomatoCriticInit['text'] = TomatoCriticInit['text'].str.replace(r'\s?http\S+', '', regex=True)

  # Tokenize, then lower-case -> drop stop words -> lemmatize -> stem in one pass.
  TomatoCriticInit['texttokens'] = TomatoCriticInit['text'].apply(word_tokenize)
  TomatoCriticInit['texttokens'] = TomatoCriticInit['texttokens'].apply(
    lambda tokens: [
      ps.stem(lemmatizer.lemmatize(word))
      for word in (token.lower() for token in tokens)
      if word not in stop_words
    ]
  )

  FinSentences = TomatoCriticInit['texttokens'].apply(lambda x: " ".join(x))
  FinText = " ".join(FinSentences)

  #Vectorizers for ngrams of 1, 2, and 3, as well as a TF-IDF vectorizer
  vectorizer = CountVectorizer(analyzer='word',stop_words='english')
  ElonWordsVector = vectorizer.fit_transform([FinText])
  vectorizer2 = CountVectorizer(analyzer='word',ngram_range=(2,2),stop_words='english')
  ElonWordsVector2 = vectorizer2.fit_transform([FinText])
  vectorizer3 = CountVectorizer(analyzer='word',ngram_range=(3,3),stop_words='english')
  # Fit the trigram vectorizer itself (this line used to refit vectorizer2).
  ElonWordsVector3 = vectorizer3.fit_transform([FinText])
  TVectorizer = TfidfVectorizer()
  ElonWordsTVector = TVectorizer.fit_transform(FinSentences)
  # Only the bigram counts are reported below.
  feature_names = vectorizer2.get_feature_names_out()
  #Print data
  frequencies = ElonWordsVector2.sum(axis=0).A1

  # Create a DataFrame to store feature names and frequencies
  df = pd.DataFrame({'feature': feature_names, 'frequency': frequencies})

  # Sort the DataFrame by frequency in descending order
  df = df.sort_values('frequency', ascending=False)
  print(f"{CurYear} Dictionary:")
  print(df.head(n=10))


  #Word cloud
  #wordcloud = WordCloud(width = 800, height = 800, background_color ='white', min_font_size = 10).generate(FinText)
  #plt.figure(facecolor = None)
  #plt.imshow(wordcloud)
  #plt.axis("off")
  #plt.tight_layout(pad = 0)
  #plt.title(f"{CurYear}'s Wordcloud",fontsize=20)
  #analysis = []
  #plt.show()
  #Sentiment analysis
  #for sentence in FinSentences:
  #    analysis.append(TextBlob(sentence).sentiment.polarity * 9 / 2 + 5.5)
  #TomatoCriticInit['sentiment'] = analysis
  #sentimentSets.append(TomatoCriticInit['sentiment'])
  #print(TextBlob(FinText).sentiment.polarity)
  #sia = SentimentIntensityAnalyzer()
  #TomatoCriticInit['sentiment2'] = TomatoCriticInit['text'].apply(lambda x: sia.polarity_scores(x)['compound'])
  #print(sia.polarity_scores(FinText))
  #TomatoCriticInit['sentiment3'] = FinSentences.apply(lambda x: NRCLex(x).affect_frequencies)
  #print(NRCLex(FinText).affect_frequencies)
  #print(TomatoCriticInit.head())
#boxPlotByYear(sentimentSets, "Rotton Tomatoes Sentiment Score Distribution")

In [None]:
# Import required libraries
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
import string

# Define genre-defining terms: each genre is tracked through a few indicative words
genre_terms = {
    "Action": ["fight", "explosion", "battle"],
    "Comedy": ["funny", "laugh", "joke"],
    "Drama": ["emotional", "character", "dialogue"],
    "Horror": ["scary", "fear", "monster"],
    "Sci-Fi": ["alien", "future", "space"]
}

# Load the critic reviews dataset
# (read again from disk; presumably because `tomato_critic` has been cleaned
# in place by earlier cells — confirm)
rt_critic_reviews = pd.read_csv('rotten_tomatoes_critic_reviews.csv')

# Extract year from review_date (unparseable dates become NaN) and filter for years >= 2000
rt_critic_reviews['year'] = pd.to_datetime(rt_critic_reviews['review_date'], errors='coerce').dt.year
# .copy() so that columns added later are written to an independent frame
# instead of a filtered slice (avoids SettingWithCopyWarning).
rt_critic_reviews = rt_critic_reviews[rt_critic_reviews['year'] >= 2000].copy()

# Text cleaning function
def clean_text(text):
    """Return `text` lower-cased with ASCII punctuation removed.

    Anything that is not a `str` (None, NaN, numbers, ...) yields "".
    """
    if isinstance(text, str):
        punctuation_remover = str.maketrans('', '', string.punctuation)
        return text.lower().translate(punctuation_remover)
    return ""

# Apply text cleaning
rt_critic_reviews['processed_text'] = rt_critic_reviews['review_content'].fillna('').apply(clean_text)

# Aggregate frequencies by year for genre terms
genre_frequencies = {genre: [] for genre in genre_terms}
# Years as plain ints so the index and x-axis read 2000, 2001, ... rather than 2000.0
years = [int(value) for value in sorted(rt_critic_reviews['year'].dropna().unique())]

# The loop variable is deliberately NOT called `year`: that name is the helper
# function used by the per-year cells earlier in the notebook, and rebinding it
# to a number would break those cells when they are re-run.
for review_year in years:
    yearly_reviews = rt_critic_reviews[rt_critic_reviews['year'] == review_year]['processed_text']
    # Whitespace-split word counts over all reviews of the year
    word_counts = Counter(" ".join(yearly_reviews).split())

    for genre, terms in genre_terms.items():
        genre_frequencies[genre].append(sum(word_counts.get(term, 0) for term in terms))

# Convert to a DataFrame for visualization
genre_trends_df = pd.DataFrame(genre_frequencies, index=years)
genre_trends_df.index.name = 'Year'

# Plot the genre frequency trends
genre_trends_df.plot(figsize=(12, 6), marker='o')
plt.title("Genre Frequency Trends (2000 and Later)")
plt.xlabel("Year")
plt.ylabel("Frequency")
plt.legend(title="Genres", bbox_to_anchor=(1.5, 1), loc='upper left')
plt.tight_layout()
plt.show()
