In [1]:
import pandas as pd

In [4]:
# !git clone https://github.com/meralegre/Big_Data_IMDb.git
# %cd Big_Data_IMDb/
# %pwd

In [3]:
df = pd.read_csv(
    "final_cleaned_df.csv",
    delimiter=",",
    quotechar='"',
    escapechar="\\",
    engine="python"
)

In [7]:
from datasets import Dataset
from transformers import BertTokenizer, set_seed
import random
import numpy as np
import torch
import pandas as pd
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [None]:
import os
os.environ["PYTHONHASHSEED"] = "42"
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"

seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
set_seed(seed)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = False
torch.use_deterministic_algorithms(False)

In [None]:
print(df.iloc[0]['label'].astype(int))

1


In [None]:
df['label'] = df['label'].astype(int)

In [None]:
print(f"Dataset has {len(df)} rows after processing")
print("Label counts:")
print(df['label'].value_counts())

Dataset has 30635 rows after processing
Label counts:
label
1    15351
0    15284
Name: count, dtype: int64


In [None]:
dataset = Dataset.from_pandas(df)
print(dataset)

Dataset({
    features: ['tconst', 'movie_title', 'year', 'numVotes', 'label', 'genre', 'content_rating', 'production_company', 'tomatometer_status', 'tomatometer_rating', 'audience_status', 'audience_rating', 'review_score', 'like_count', 'label_int', 'reviews', 'review_lemmatized'],
    num_rows: 30635
})


In [None]:
dataset = Dataset.from_pandas(df)
print(dataset)

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

def tokenize(examples):
    # Make sure we have valid texts in the 'reviews' column
    texts = examples['reviews']

    # Replace any None or NaN values with empty strings to avoid tokenization errors
    texts = [str(text) if text is not None else "" for text in texts]

    return tokenizer(texts, padding='max_length', truncation=True)

Dataset({
    features: ['tconst', 'movie_title', 'year', 'numVotes', 'label', 'genre', 'content_rating', 'production_company', 'tomatometer_status', 'tomatometer_rating', 'audience_status', 'audience_rating', 'review_score', 'like_count', 'label_int', 'reviews', 'review_lemmatized'],
    num_rows: 30635
})


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
tokenized = dataset.map(tokenize, batched=True)

Map:   0%|          | 0/30635 [00:00<?, ? examples/s]

In [None]:
train_test_val = tokenized.train_test_split(test_size=0.2)
train_dataset = train_test_val['train']
val_dataset = train_test_val['test']

In [None]:
from transformers import BertForSequenceClassification, AutoModelForSequenceClassification, Trainer, TrainingArguments, AdamW, get_linear_schedule_with_warmup, EarlyStoppingCallback

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
#model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

num_epochs = 2
batch_size = 16

total_steps = len(train_dataset) * num_epochs // batch_size
warmup_steps = int(0.1 * total_steps)

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps
)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    accuracy = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='binary')

    return {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }



In [None]:
training_args = TrainingArguments(
    report_to="none",
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    learning_rate=1e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    #fp16=True,
    weight_decay=0.01,
)
# Define Trainer with model, arguments, and datasets
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)
# Start training
trainer.train()

Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5264,0.546452,0.737392,0.681575,0.846192,0.570577
2,0.4189,0.455643,0.786519,0.779501,0.793411,0.76607


TrainOutput(global_step=3064, training_loss=0.519610775979941, metrics={'train_runtime': 5183.2386, 'train_samples_per_second': 9.457, 'train_steps_per_second': 0.591, 'total_flos': 1.289665148952576e+16, 'train_loss': 0.519610775979941, 'epoch': 2.0})

In [None]:
metrics

{'eval_loss': 0.45564281940460205,
 'eval_accuracy': 0.7865186877754202,
 'eval_f1': 0.7795010114632501,
 'eval_precision': 0.7934111187371311,
 'eval_recall': 0.7660702451954937,
 'eval_runtime': 198.7024,
 'eval_samples_per_second': 30.835,
 'eval_steps_per_second': 1.928,
 'epoch': 2.0}

In [None]:
metrics = trainer.evaluate()
print(metrics)

{'eval_loss': 0.45564281940460205, 'eval_accuracy': 0.7865186877754202, 'eval_f1': 0.7795010114632501, 'eval_precision': 0.7934111187371311, 'eval_recall': 0.7660702451954937, 'eval_runtime': 198.7024, 'eval_samples_per_second': 30.835, 'eval_steps_per_second': 1.928, 'epoch': 2.0}


In [None]:
output_dir = "./sentiment_model/"
model_path = os.path.join(output_dir, "model")
tokenizer_path = os.path.join(output_dir, "tokenizer")

In [None]:
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

In [None]:
model.save_pretrained(model_path)
tokenizer.save_pretrained(tokenizer_path)
print(f"Model saved to {model_path}")
print(f"Tokenizer saved to {tokenizer_path}")

Model saved to ./sentiment_model/model
Tokenizer saved to ./sentiment_model/tokenizer


In [None]:
import shutil
shutil.make_archive("sentiment_model", 'zip', output_dir)
from google.colab import files
files.download("sentiment_model.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

TEXT TO USE THE MODEL DOWNLOADED

In [9]:
from transformers import BertForSequenceClassification, BertTokenizer
import pyspark.sql.functions as F
from pyspark.sql import SparkSession, Row
from pyspark.sql.types import StructType, StructField, ArrayType, StringType, IntegerType, FloatType
from pyspark.sql.functions import col, when, udf, regexp_replace, lower, trim, lit, coalesce, array, concat_ws, concat, split

from pyspark.ml import Pipeline

# # Load model and tokenizer
# loaded_model = BertForSequenceClassification.from_pretrained("sentiment_model/model")
# loaded_tokenizer = BertTokenizer.from_pretrained("sentiment_model/tokenizer")

# # Use the model for inference
# inputs = loaded_tokenizer("disgusting", return_tensors="pt")
# outputs = loaded_model(**inputs)
# predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
# print(predictions)  # [negative_prob, positive_prob]

In [10]:
spark = SparkSession.builder \
    .appName("BERT-Sentiment") \
    .config("spark.driver.memory", "16g") \
    .config("spark.executor.memory", "8g") \
    .config("spark.sql.execution.arrow.enabled", "true") \
    .getOrCreate()

In [11]:
from transformers import BertForSequenceClassification, BertTokenizer

model_name = "lvwerra/bert-imdb"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

loaded_model = BertForSequenceClassification.from_pretrained(model_name).to(device)
loaded_tokenizer = BertTokenizer.from_pretrained(model_name)

loaded_model.eval()

Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/705 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/40.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1

In [24]:
inputs = loaded_tokenizer("amazing", return_tensors="pt").to(device)
outputs = loaded_model(**inputs)
predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
print(predictions)

tensor([[0.0795, 0.9205]], device='cuda:0', grad_fn=<SoftmaxBackward0>)


In [None]:
# df2 = spark.read.csv("final_cleaned_df.csv", header=True)

In [25]:
def predict_sentiment(text):
    if pd.isna(text) or len(text.strip()) == 0:
        return 3  # Neutral if empty

    inputs = loaded_tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)

    with torch.no_grad():
        outputs = loaded_model(**inputs)

    probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)

    negative_prob = probabilities[0][0].item()
    positive_prob = probabilities[0][1].item()

    if negative_prob >= 0.9:
        return 1  # Very Negative
    elif negative_prob >= 0.7:
        return 2  # Negative
    elif 0.3 <= positive_prob <= 0.7:
        return 3  # Neutral
    elif positive_prob >= 0.7:
        return 4  # Positive
    elif positive_prob >= 0.9:
        return 5  # Very Positive
    else:
        return 3

In [26]:
reviews_df = df.copy()
#reviews_df = reviews_df[["tconst", "review_lemmatized"]]

reviews_df["bert_sentiment"] = reviews_df["review_lemmatized"].apply(predict_sentiment)

# Show processed data
reviews_df.head()

Unnamed: 0,tconst,movie_title,year,numVotes,label,genre,content_rating,production_company,tomatometer_status,tomatometer_rating,audience_status,audience_rating,review_score,like_count,label_int,reviews,review_lemmatized,bert_sentiment
0,tt0016029,the little colonel,1935,1646,True,Unknown,Unknown,Unknown,1,78,1,80,Unknown,0.0,1,"THE LITTLE COLONEL (Fox, 1935), directed by Da...",littl colonel fox 1935 direct david butler sta...,4
1,tt0016029,the little colonel,1935,1646,True,Unknown,Unknown,Unknown,1,78,1,80,Unknown,0.0,1,It's odd that Shirley Temple made two similar ...,odd shirley templ made two similar movi year i...,4
2,tt0016029,the little colonel,1935,1646,True,Unknown,Unknown,Unknown,1,78,1,80,Unknown,0.0,1,"With all of her usual show-stealing spark, Shi...",usual show steal spark shirley templ deliv ano...,4
3,tt0016029,the little colonel,1935,1646,True,Unknown,Unknown,Unknown,1,78,1,80,Unknown,0.0,1,There will never be a child star to match Shir...,never child star match shirley templ born actr...,4
4,tt0017961,happiness,1935,1080,True,Unknown,Unknown,Unknown,1,78,1,80,Unknown,0.0,1,"I really love silent cinema of all types, and ...",realli love silent cinema type favorit film si...,4


In [27]:
reviews_df.to_csv("reviews_with_sentiment.csv", index=False)

### **With our model**

In [14]:
# Load model and tokenizer
loaded_self_model = BertForSequenceClassification.from_pretrained("sentiment_model/model").to(device)
loaded_self_tokenizer = BertTokenizer.from_pretrained("sentiment_model/tokenizer")

# Use the model for inference
# inputs = loaded_tokenizer("disgusting", return_tensors="pt")
# outputs = loaded_model(**inputs)
# predictions_self = torch.nn.functional.softmax(outputs.logits, dim=-1)
# print(predictions)  # [negative_prob, positive_prob]

In [15]:
def predict_self_sentiment(text):
    if pd.isna(text) or len(text.strip()) == 0:
        return 3  # Neutral if empty

    inputs = loaded_self_tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)

    with torch.no_grad():
        outputs = loaded_self_model(**inputs)

    probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)

    negative_prob = probabilities[0][0].item()
    positive_prob = probabilities[0][1].item()

    # Map probabilities to sentiment score (1-5)
    if negative_prob >= 0.9:
        return 1  # Very Negative
    elif negative_prob >= 0.7:
        return 2  # Negative
    elif 0.3 <= positive_prob <= 0.7:
        return 3  # Neutral
    elif positive_prob >= 0.7:
        return 4  # Positive
    elif positive_prob >= 0.9:
        return 5  # Very Positive
    else:
        return 3  # Default to Neutral

df2 = df.copy()
# Apply sentiment analysis
df2["self_sentiment"] = df2["review_lemmatized"].apply(predict_self_sentiment)

# Show processed data
df2.head()

Unnamed: 0,tconst,movie_title,year,numVotes,label,genre,content_rating,production_company,tomatometer_status,tomatometer_rating,audience_status,audience_rating,review_score,like_count,label_int,reviews,review_lemmatized,self_sentiment
0,tt0016029,the little colonel,1935,1646,True,Unknown,Unknown,Unknown,1,78,1,80,Unknown,0.0,1,"THE LITTLE COLONEL (Fox, 1935), directed by Da...",littl colonel fox 1935 direct david butler sta...,4
1,tt0016029,the little colonel,1935,1646,True,Unknown,Unknown,Unknown,1,78,1,80,Unknown,0.0,1,It's odd that Shirley Temple made two similar ...,odd shirley templ made two similar movi year i...,3
2,tt0016029,the little colonel,1935,1646,True,Unknown,Unknown,Unknown,1,78,1,80,Unknown,0.0,1,"With all of her usual show-stealing spark, Shi...",usual show steal spark shirley templ deliv ano...,4
3,tt0016029,the little colonel,1935,1646,True,Unknown,Unknown,Unknown,1,78,1,80,Unknown,0.0,1,There will never be a child star to match Shir...,never child star match shirley templ born actr...,3
4,tt0017961,happiness,1935,1080,True,Unknown,Unknown,Unknown,1,78,1,80,Unknown,0.0,1,"I really love silent cinema of all types, and ...",realli love silent cinema type favorit film si...,4


In [16]:
df2.to_csv("reviews_with_self_sentiment.csv", index=False)

## **Add missing columns to val and test**

In [28]:
df_reviews_bert = pd.read_csv(
    "reviews_with_sentiment.csv"
)

In [29]:
df_reviews_self = pd.read_csv(
    "reviews_with_self_sentiment.csv"
)

### **BERT IMDB**

In [40]:
reviews_bert = df_reviews_bert[["tconst", "reviews", "review_lemmatized"]]
df_bert = df_reviews_bert.copy().drop(columns=["reviews", "review_lemmatized"])

numerical_features = ["numVotes", "like_count",
                      "tomatometer_status", "tomatometer_rating",
                      "audience_status", "audience_rating", "like_count", "bert_sentiment"]

# Define categorical features (take first occurrence)
categorical_features = ["movie_title", "year", "genre", "content_rating", "production_company", "label"]

# Aggregate numerical & categorical features
df_grouped = df_bert.groupby("tconst", as_index=False).agg(
    {**{col: "mean" for col in numerical_features},
     **{col: "first" for col in categorical_features}}
)

# Aggregate reviews (concatenate all reviews for each movie)
df_reviews = reviews_bert.groupby("tconst", as_index=False).agg({
    "reviews": lambda x: " || ".join(x.dropna().astype(str)),
    "review_lemmatized": lambda x: " || ".join(x.dropna().astype(str))
})

# Merge reviews into grouped dataset
df_grouped_bert = df_grouped.merge(df_reviews, on="tconst", how="left")
df_grouped_bert

Unnamed: 0,tconst,numVotes,like_count,tomatometer_status,tomatometer_rating,audience_status,audience_rating,bert_sentiment,movie_title,year,genre,content_rating,production_company,label,reviews,review_lemmatized
0,tt0016029,1646.0,0.0,1.0,78.0,1.0,80.0,4.00,the little colonel,1935,Unknown,Unknown,Unknown,True,"THE LITTLE COLONEL (Fox, 1935), directed by Da...",littl colonel fox 1935 direct david butler sta...
1,tt0017961,1080.0,0.0,1.0,78.0,1.0,80.0,3.00,happiness,1935,Unknown,Unknown,Unknown,True,"I really love silent cinema of all types, and ...",realli love silent cinema type favorit film si...
2,tt0020298,3226.0,0.0,1.0,78.0,1.0,80.0,2.25,queen kelly,1932,Unknown,Unknown,Unknown,True,I'd imagine that most people who would come to...,d imagin peopl come page read review erich von...
3,tt0020768,3199.0,0.0,1.0,78.0,1.0,80.0,3.50,city girl,1930,Unknown,Unknown,Unknown,True,Silent film may be the only unique art form ev...,silent film may uniqu art form ever flourish b...
4,tt0021309,0.0,0.0,1.0,78.0,1.0,80.0,3.25,the story of the fox,1937,Unknown,Unknown,Unknown,True,One of the first animated feature films of the...,one first anim featur film world tale fox stil...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7807,tt9850344,1325.0,0.0,1.0,78.0,1.0,80.0,4.00,night shift,2020,Unknown,Unknown,Unknown,False,It would have been much better to have a longe...,much better longer first part unlik second par...
7808,tt9850386,4144.0,0.0,1.0,78.0,1.0,80.0,3.75,the bee gees: how can you mend a broken heart,2020,Unknown,Unknown,Unknown,True,"""The Bee Gees: How Can You Mend A Broken Heart...",bee gee mend broken heart 2020 releas 111 min ...
7809,tt9900782,0.0,0.0,1.0,78.0,1.0,80.0,4.00,kaithi,2019,Unknown,Unknown,Unknown,True,"A drug burst, an injured cop and a Convicted C...",drug burst injur cop convict crimin want meet ...
7810,tt9904802,0.0,0.0,1.0,78.0,1.0,80.0,1.75,enemy lines,2020,Unknown,Unknown,Unknown,False,"Is my first impression, its the lack of determ...",first impress lack determin determin whole plo...


### **OUR BERT MODEL**

In [41]:
reviews_self = df_reviews_self[["tconst", "reviews", "review_lemmatized"]]
df_self = df_reviews_self.copy().drop(columns=["reviews", "review_lemmatized"])

numerical_features = ["numVotes", "like_count",
                      "tomatometer_status", "tomatometer_rating",
                      "audience_status", "audience_rating", "like_count", "self_sentiment"]

# Define categorical features (take first occurrence)
categorical_features = ["movie_title", "year", "genre", "content_rating", "production_company", "label"]

# Aggregate numerical & categorical features
df_grouped = df_self.groupby("tconst", as_index=False).agg(
    {**{col: "mean" for col in numerical_features},
     **{col: "first" for col in categorical_features}}
)

# Aggregate reviews (concatenate all reviews for each movie)
df_reviews = reviews_self.groupby("tconst", as_index=False).agg({
    "reviews": lambda x: " || ".join(x.dropna().astype(str)),
    "review_lemmatized": lambda x: " || ".join(x.dropna().astype(str))
})

# Merge reviews into grouped dataset
df_grouped_self = df_grouped.merge(df_reviews, on="tconst", how="left")
df_grouped_self

Unnamed: 0,tconst,numVotes,like_count,tomatometer_status,tomatometer_rating,audience_status,audience_rating,self_sentiment,movie_title,year,genre,content_rating,production_company,label,reviews,review_lemmatized
0,tt0016029,1646.0,0.0,1.0,78.0,1.0,80.0,3.50,the little colonel,1935,Unknown,Unknown,Unknown,True,"THE LITTLE COLONEL (Fox, 1935), directed by Da...",littl colonel fox 1935 direct david butler sta...
1,tt0017961,1080.0,0.0,1.0,78.0,1.0,80.0,3.25,happiness,1935,Unknown,Unknown,Unknown,True,"I really love silent cinema of all types, and ...",realli love silent cinema type favorit film si...
2,tt0020298,3226.0,0.0,1.0,78.0,1.0,80.0,3.00,queen kelly,1932,Unknown,Unknown,Unknown,True,I'd imagine that most people who would come to...,d imagin peopl come page read review erich von...
3,tt0020768,3199.0,0.0,1.0,78.0,1.0,80.0,3.75,city girl,1930,Unknown,Unknown,Unknown,True,Silent film may be the only unique art form ev...,silent film may uniqu art form ever flourish b...
4,tt0021309,0.0,0.0,1.0,78.0,1.0,80.0,3.25,the story of the fox,1937,Unknown,Unknown,Unknown,True,One of the first animated feature films of the...,one first anim featur film world tale fox stil...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7807,tt9850344,1325.0,0.0,1.0,78.0,1.0,80.0,3.00,night shift,2020,Unknown,Unknown,Unknown,False,It would have been much better to have a longe...,much better longer first part unlik second par...
7808,tt9850386,4144.0,0.0,1.0,78.0,1.0,80.0,3.50,the bee gees: how can you mend a broken heart,2020,Unknown,Unknown,Unknown,True,"""The Bee Gees: How Can You Mend A Broken Heart...",bee gee mend broken heart 2020 releas 111 min ...
7809,tt9900782,0.0,0.0,1.0,78.0,1.0,80.0,3.50,kaithi,2019,Unknown,Unknown,Unknown,True,"A drug burst, an injured cop and a Convicted C...",drug burst injur cop convict crimin want meet ...
7810,tt9904802,0.0,0.0,1.0,78.0,1.0,80.0,2.75,enemy lines,2020,Unknown,Unknown,Unknown,False,"Is my first impression, its the lack of determ...",first impress lack determin determin whole plo...


### SAVE THE DATA

In [42]:
df_grouped_bert.to_csv("df_grouped_bert.csv", index=False)

In [43]:
df_grouped_self.to_csv("df_grouped_self.csv", index=False)

### ADDING MISSING DATA WITH BERT IMDB

In [53]:
# Load validation and test datasets
validation_df = pd.read_csv("Cleaned_CSVs/val_changed.csv")
# test_df = pd.read_csv("data/test_hidden.csv")

# validation_df.drop(columns=["Unnamed: 0"], inplace=True)
# test_df.drop(columns=["Unnamed: 0"], inplace=True)

# print(validation_df.head())

# # Identify missing columns
# missing_in_val = set(df_grouped_bert.columns) - set(validation_df.columns)
# missing_in_test = set(df_grouped_bert.columns) - set(test_df.columns)

# print(f"missing columns for validation: {missing_in_val}")
# print(f"missing columns for test: {missing_in_test}")

Unnamed: 0_level_0,count
tomatometer_status,Unnamed: 1_level_1
-1,955


In [50]:
validation_df = validation_df.merge(df_grouped_bert[["tconst"] + list(missing_in_val)], on="tconst", how="left")
test_df = test_df.merge(df_grouped_bert[["tconst"] + list(missing_in_test)], on="tconst", how="left")

print("Validation:\n", validation_df.head())
print()
print("Test:\n", test_df.head())

Validation:
       tconst               primaryTitle originalTitle startYear endYear  \
0  tt0003740                    Cabiria           NaN      1914      \N   
1  tt0008663            A Man There Was   Terje Vigen      1917      \N   
2  tt0010307                  J'accuse!           NaN      1919      \N   
3  tt0014429               Safety Last!  Safety Last!      1923      \N   
4  tt0015175  Die Nibelungen: Siegfried           NaN      1924      \N   

  runtimeMinutes  numVotes genre  tomatometer_rating  bert_sentiment  ...  \
0            148    3452.0   NaN                 NaN             NaN  ...   
1             65    1882.0   NaN                 NaN             NaN  ...   
2            166    1692.0   NaN                 NaN             NaN  ...   
3             74   19898.0   NaN                 NaN             NaN  ...   
4            143    5676.0   NaN                 NaN             NaN  ...   

   audience_rating  content_rating label like_count  tomatometer_status  

### ADDING MISSING DATA WITH SELF BERT