## SENTIMENT ANALYSIS

#### IMPORTS

In [None]:
import pandas as pd
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from google.colab import drive
drive.mount('/content/drive')
import re
import json

In [None]:
# Build the sentiment-analysis pipeline backed by the SiEBERT RoBERTa-large
# English checkpoint. Truncation is enabled with max_length=512 so that long
# reviews are cut instead of exceeding the configured maximum length.
from transformers import pipeline

sentiment_analysis = pipeline(
    "sentiment-analysis",
    model="siebert/sentiment-roberta-large-english",
    max_length=512,
    truncation=True,
)

# Quick smoke test on a single sentence
print(sentiment_analysis("I love this!"))

In [None]:
# Folder on the mounted Google Drive that every cell below prepends to the
# file names it reads (raw review CSVs) and writes (JSON score batches, clean CSVs).
# CHANGE THE FOLLOWING DIRECTORY IF YOU WANT TO REPRODUCE THE RESULTS
drive_directory = "/content/drive/MyDrive/INNOVATION-PROJECT/"

#### PROCESSING FUNCTIONS

In [None]:
def preprocess(x):

    """
    Strip unwanted fragments from a review string.

    Inputs: x --> String to be cleaned
    Outputs: clean string

    """
    # Patterns are removed one after another, in this order:
    #   1. literal backslash + "n" sequences (escaped newlines in the raw text)
    #   2. literal backslash + "t" sequences (escaped tabs in the raw text)
    #   3. double quotes
    #   4. e-mail addresses
    # URL removal is currently disabled; the pattern is kept for reference:
    #   r"https?:\/\/.*?[\s+]"
    patterns_to_remove = (
        r"\\n",
        r"\\t",
        r'"',
        r"([a-zA-Z0-9_\-\.]+)@([a-zA-Z0-9_\-\.]+)\.([a-zA-Z]{2,5})",
    )

    cleaned = x
    for pattern in patterns_to_remove:
        cleaned = re.sub(pattern, "", cleaned)
    return cleaned

In [None]:
def clean_df(df, date_column, rating_column, text_column):
    """
    Clean a raw reviews DataFrame: drop rows without text, tidy the review
    text, parse the dates and convert the ratings to floats.

    Inputs: df --> pandas DataFrame to be cleaned (the caller's frame is not modified)
            date_column --> name of the column containing the date information
            rating_column --> name of the column containing the rating information
            text_column --> name of the column containing the reviews

    Outputs: clean dataframe (a new frame with a fresh 0..n-1 index)

    """

    def _parse_date(value):
        # Try the "Mon DD YYYY" layout first; fall back to pandas' generic parser.
        try:
            return pd.to_datetime(value, format="%b %d %Y")
        except ValueError:
            return pd.to_datetime(value)

    # clean text: rows whose text is missing (or equal to 0) are dropped.
    # The mask is computed on a separate Series so the input frame stays untouched.
    text = df[text_column].fillna(0)
    df = df[text != 0].reset_index(drop=True)
    df[text_column] = [preprocess(x) for x in df[text_column]]

    # clean the dates and put them in pd.date_time format
    df[date_column] = [_parse_date(value) for value in df[date_column]]

    # clean the rating - strip a trailing " stars" and store it as a float number
    ratings = df[rating_column].astype(str)
    df[rating_column] = [float(re.sub(r"\sstars", "", rating)) for rating in ratings]

    return df

In [None]:
def create_sentiment(df, text_column, id_batch, batch_size=500):
    """
    Run the sentiment pipeline over every review, in small batches that are
    saved to disk as they are produced.

    Inputs: df --> pandas DataFrame on which sentiment analysis is performed
            text_column --> name of the column containing the reviews
            id_batch --> Id of the batch in string format, e.g. "798"
            batch_size --> number of reviews scored per pipeline call (default 500)

    Outputs: list containing sentiment scores and labels for all reviews

    Each batch is written to drive_directory as "scores{id_batch}_{i}.json".

    """
    texts = df[text_column].tolist()

    # Split the reviews into consecutive slices of at most batch_size items.
    batches = [texts[start:start + batch_size] for start in range(0, len(texts), batch_size)]

    # perform sentiment analysis on small batches at a time to save RAM, and save them
    for i, batch in enumerate(batches):
        scores = sentiment_analysis(batch)
        with open(drive_directory + f"scores{id_batch}_{i}.json", "w") as f:
            json.dump(scores, f)

    # read the saved batches and join them, so the result matches what is on disk
    sentiment = []
    for i in range(len(batches)):
        with open(drive_directory + f"scores{id_batch}_{i}.json", "r") as file:
            sentiment += json.load(file)

    return sentiment

In [None]:
def add_sentiment(df, sentiment_list):
    """
    Attach the sentiment results to the reviews DataFrame.

    Inputs: df --> pandas DataFrame on which we want to add the sentiment scores
            sentiment_list --> list containing the sentiment result for each review;
                               each item is a dict with "label" and "score" keys

    Outputs: dataframe with new columns containing sentiment information:
             "sentiment" (raw dict), "sentiment_label" and "sentiment_score".
             The score is kept as-is for "POSITIVE" rows and replaced by
             1 - score otherwise, so negative reviews end up close to 0.

    """
    # reset_index returns a new frame, so the caller's DataFrame is left untouched
    df = df.reset_index(drop=True)
    df["sentiment"] = sentiment_list
    df["sentiment_score"] = df["sentiment"].apply(lambda x: x.get("score"))
    df["sentiment_label"] = df["sentiment"].apply(lambda x: x.get("label"))

    # change the score to be close to 0 if negative
    is_positive = df["sentiment_label"] == "POSITIVE"
    df["sentiment_score"] = df["sentiment_score"].where(is_positive, 1 - df["sentiment_score"])

    # "Unnamed: 0" is the index column left over from a previous to_csv/read_csv
    # round trip; drop it when present instead of failing when it is absent.
    df = df.drop(columns="Unnamed: 0", errors="ignore")

    return df

### FIRST BATCH

In [None]:
# Clean and score the first batch of reviews, then store the result.
reviews300 = pd.read_csv(drive_directory + "reviews300.csv")
current_300 = clean_df(reviews300,"Date","Rating","Text")
sentiment_300 = create_sentiment(current_300,"Text","300")
final300 = add_sentiment(current_300, sentiment_300)
# Written to its own file: "reviews300clean.csv" is the name the de-duplication
# step of the sixth batch reads, and "reviews300-487clean.csv" belongs to the
# second batch (which would otherwise overwrite this output).
final300.to_csv(drive_directory + "reviews300clean.csv")

### SECOND BATCH

In [None]:
# Clean and score the second batch: rows from position 5431 onwards.
# NOTE(review): this cell reads "reviews300.csv" although the variables are named
# after 487 -- presumably the file kept growing and batches are told apart by
# row offset; confirm that this is the intended input file.
reviews487 = pd.read_csv(drive_directory + "reviews300.csv")
current_487 = clean_df(reviews487,"Date","Rating","Text")
# Skip the first 5431 cleaned rows and restart the index at 0
current_487 = current_487.loc[5431:,:].reset_index(drop = True)
sentiment_487 = create_sentiment(current_487,"Text","487")
final487 = add_sentiment(current_487, sentiment_487)
final487.to_csv(drive_directory + "reviews300-487clean.csv")

### THIRD BATCH

In [None]:
# Clean and score the third batch: rows after the first 5431 + 3419 cleaned rows.
# NOTE(review): this cell reads "reviews300.csv" although the variables are named
# after 798 -- presumably the file kept growing and batches are told apart by
# row offset; confirm that this is the intended input file.
reviews798 = pd.read_csv(drive_directory + "reviews300.csv")
current_798 = clean_df(reviews798,"Date","Rating","Text")
# Offset = number of rows skipped by the second batch (5431) plus 3419 more
current_index = 5431 + 3419
current_798 = current_798.loc[current_index:].reset_index(drop = True)
sentiment_798 = create_sentiment(current_798,"Text","798")
final798 = add_sentiment(current_798, sentiment_798)
final798.to_csv(drive_directory + "reviews487-798clean.csv")

### FOURTH BATCH

In [None]:
# Clean and score the fourth batch (the whole "reviews1000.csv" file).
reviews1000 = pd.read_csv(drive_directory + "reviews1000.csv")
current_1000 = clean_df(reviews1000,"Date","Rating","Text")
# The batch id is passed as a string, as documented by create_sentiment and as
# done for every other batch; the resulting file names are unchanged.
sentiment_1000 = create_sentiment(current_1000, "Text","1000")
final1000 = add_sentiment(current_1000, sentiment_1000)
final1000.to_csv( drive_directory + "reviews798-1000clean.csv")

### FIFTH BATCH

In [None]:
# Clean and score the fifth batch: rows from position 3368 onwards of the
# reviews that have text.
reviews1250 = pd.read_csv(drive_directory + "reviews1250.csv")
reviews1250.Text = reviews1250.Text.fillna(0)
current_1250 = reviews1250[reviews1250.Text != 0].reset_index(drop = True).loc[3368:]
# Drop rows whose Date cell is the literal string "Date" (presumably header
# lines repeated inside the CSV -- they cannot be parsed as dates).
current_df = current_1250[current_1250.Date != "Date"]
# Clean the *filtered* frame; previously the filter result was discarded.
current_1250 = clean_df(current_df,"Date","Rating","Text")
sentiment_1250 = create_sentiment(current_1250, "Text","1250")
final1250 = add_sentiment(current_1250, sentiment_1250)
final1250.to_csv(drive_directory + "reviews1000-1250clean.csv")

### SIXTH BATCH - the previous batch had some problems with duplicates, so we add an extra step to ensure that only new data is processed

In [None]:
# Load the sixth raw file and keep only the rows that actually have review text:
# missing texts are first replaced by 0, then every row whose text equals 0 is removed.
reviews1466 = pd.read_csv(drive_directory + "reviews1466.csv")
reviews1466["Text"] = reviews1466["Text"].fillna(0)
has_text = reviews1466["Text"] != 0
reviews1466 = reviews1466[has_text]

In [None]:
# Load the cleaned output of the five previous batches
clean_files = (
    "reviews300clean.csv",
    "reviews300-487clean.csv",
    "reviews487-798clean.csv",
    "reviews798-1000clean.csv",
    "reviews1000-1250clean.csv",
)
batch1, batch2, batch3, batch4, batch5 = (
    pd.read_csv(drive_directory + file_name) for file_name in clean_files
)

# Distinct review ids found in each previous batch
r1, r2, r3, r4, r5 = (
    batch["Review ID"].unique() for batch in (batch1, batch2, batch3, batch4, batch5)
)

# TAKE ONLY REVIEWS THAT WERE NOT IN THE PREVIOUS BATCHES
s = set().union(r1, r2, r3, r4, r5)
already_processed = reviews1466["Review ID"].isin(s)
reviews1466 = reviews1466[~already_processed]

In [None]:
# Clean the remaining (new) reviews, score them and store the result
current_1466 = clean_df(
    reviews1466,
    date_column="Date",
    rating_column="Rating",
    text_column="Text",
)
sentiment_1466 = create_sentiment(current_1466, text_column="Text", id_batch="1466")
final1466 = add_sentiment(current_1466, sentiment_list=sentiment_1466)
final1466.to_csv(drive_directory + "reviews1250-1466clean.csv")