# Preprocessing

In [7]:
import pandas as pd
import numpy as np
import os
from datetime import datetime

In [None]:
csv_path = os.path.join("Dataset", "Reviews.csv")
df = pd.read_csv(csv_path)

In [11]:
items = [",", ".", "!", "?", '"', "'"]

for item in items:
    df["Summary"] = df["Summary"].str.replace(item, '')
    df["Text"] = df["Text"].str.replace(item, '')
    df["ProfileName"] = df["ProfileName"].str.replace(item, '')

In [12]:
df["Time"] = df["Time"].apply(lambda time: datetime.fromtimestamp(time).year)

In [16]:
csv_path = os.path.join("Dataset", "Reviews-Parsed.csv")
df.to_csv(csv_path, index=False)

# Data Profiling

In [1]:
import pandas as pd
import numpy as np
import os
from datetime import datetime

In [2]:
csv_path = os.path.join("Dataset", "Reviews-Parsed.csv")
df = pd.read_csv(csv_path)

In [3]:
df.dtypes

Id                         int64
ProductId                 object
UserId                    object
ProfileName               object
HelpfulnessNumerator       int64
HelpfulnessDenominator     int64
Score                      int64
Time                       int64
Summary                   object
Text                      object
dtype: object

In [4]:
df.isnull().sum()

Id                         0
ProductId                  0
UserId                     0
ProfileName               64
HelpfulnessNumerator       0
HelpfulnessDenominator     0
Score                      0
Time                       0
Summary                   98
Text                       0
dtype: int64

In [5]:
time_count = {}
for time in df["Time"]:
    if time not in time_count.keys():
        time_count[time] = 0
    time_count[time] += 1
time_count

{2011: 163299,
 2012: 198659,
 2008: 34163,
 2005: 1335,
 2010: 85884,
 2009: 55326,
 2006: 6671,
 2007: 22300,
 2004: 561,
 2000: 32,
 2003: 132,
 2002: 73,
 2001: 13,
 1999: 6}

# Analisi tempi di esecuzione

In [10]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

In [12]:
def plot_dataframe_time(df, title, base_path):
    
    plt.figure(figsize=(8, 6))
    
    plt.grid(True)
    
    for column in ["MapReduce", "Hive", "SparkCore", "SparkSQL"]:
        plt.plot(df["DimensioneInput"],df[column], label=column)
    
    
    plt.xlabel("Dimensione input")
    plt.ylabel("Secondi")
    plt.title(title)
    
    plt.legend()
    
    plt.savefig('plot.png')
    plt.show()

### Job2

In [16]:
csv_path = os.path.join("esercizio2", "tempi-locale.csv")
df = pd.read_csv(csv_path)
df

Unnamed: 0,DimensioneInput,MapReduce,Hive,SparkCore,SparkSQL
0,100000,18.67,50.104,19.175,55.139
1,200000,21.218,55.052,20.739,57.451
2,300000,22.522,63.451,22.72,60.731
3,400000,24.736,70.422,24.262,62.635
4,500000,28.138,79.06,29.09,69.864


In [17]:
plot_dataframe_time(df, "Tempi di esecuzione Job 2 - Locale", "esercizio2/")

TypeError: plot_dataframe_time() takes 2 positional arguments but 3 were given