## Analisi e pulizia dei dati

In [2]:
from pathlib import Path

import pandas as pd

In [40]:
# Load the reviews CSV into a DataFrame and preview its first rows.
reviews_csv_path = './../archive/Reviews.csv'
reviews = pd.read_csv(reviews_csv_path)
reviews.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [6]:
# Count the missing values in each column (`isna` is the same check as `isnull`).
reviews.isna().sum()

Id                         0
ProductId                  0
UserId                     0
ProfileName               16
HelpfulnessNumerator       0
HelpfulnessDenominator     0
Score                      0
Time                       0
Summary                   27
Text                       0
dtype: int64

### Nel dataset sono presenti dati nulli nelle colonne **ProfileName** e **Summary** che non intaccano lo svolgimento dei Jobs

In [42]:
# One independent deep copy of the full frame per job, so each one can be
# trimmed to its own columns without touching `reviews` or the other copies.
reviews_job1, reviews_job2, reviews_job3 = (
    reviews.copy(deep=True) for _ in range(3)
)

### Per il Job1 abbiamo bisogno delle colonne **ProductId**, **Time** e **Text**

In [8]:
# Keep only the columns Job1 consumes. Selecting them from the untouched
# `reviews` frame (instead of dropping the others in place) makes this cell
# safe to re-run: an in-place drop raises KeyError on a second execution
# because the dropped columns no longer exist.
job1_columns = ['ProductId', 'Time', 'Text']
reviews_job1 = reviews[job1_columns].copy()
reviews_job1.head()

Unnamed: 0,ProductId,Time,Text
0,B001E4KFG0,1303862400,I have bought several of the Vitality canned d...
1,B00813GRG4,1346976000,Product arrived labeled as Jumbo Salted Peanut...
2,B000LQOCH0,1219017600,This is a confection that has been around a fe...
3,B000UA0QIQ,1307923200,If you are looking for the secret ingredient i...
4,B006K2ZZ7K,1350777600,Great taffy at a great price. There was a wid...


#### Il testo contiene punteggiatura e tag HTML che dovranno essere rimossi
#### La colonna **Time** contiene timestamp Unix (in secondi) e andrà convertita in data per individuare l'anno di riferimento

In [14]:
# Inspect the full text of a single review (row label 30) to see the raw
# content, including embedded HTML such as `<br />`.
reviews_job1.loc[30, 'Text']

"I have never been a huge coffee fan. However, my mother purchased this little machine and talked me into trying the Latte Macciato. No Coffee Shop has a better one and I like most of the other products, too (as a usually non-coffee drinker!).<br />The little Dolche Guesto Machine is super easy to use and prepares a really good Coffee/Latte/Cappuccino/etc in less than a minute (if water is heated up). I would recommend the Dolce Gusto to anyone. Too good for the price and I'am getting one myself! :)"

##### Funzione per creare file di diverse dimensioni da utilizzare nei Job

In [23]:
# Size factors, relative to the number of rows of the input frame.
dataset_sizes = [0.5, 1, 2, 5, 10]


def create_datasets(dataset, job, sizes=None, output_dir='./input'):
    """Write resampled copies of ``dataset`` as tab-separated files, one per size factor.

    For every factor ``size``, ``round(len(dataset) * size)`` rows are drawn
    with replacement using the fixed seed 42, then written without header or
    index to ``<output_dir>/reviews_<job>_dim_<size>.csv``. The dot is removed
    from ``size`` in the file name (``0.5`` -> ``05``). Because sampling is
    always done with replacement, files for factors <= 1 can also contain
    repeated rows.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to sample from.
    job : str
        Label embedded in the output file names (e.g. ``'job1'``).
    sizes : iterable of numbers, optional
        Size factors to generate. Defaults to the module-level ``dataset_sizes``.
    output_dir : str or path-like, optional
        Destination directory; created (with parents) if it does not exist.

    Returns
    -------
    None
    """
    if sizes is None:
        sizes = dataset_sizes

    # `to_csv` does not create missing directories, so make sure the target
    # exists instead of failing on the first write.
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    total_rows = dataset.shape[0]
    for size in sizes:
        n_rows = round(total_rows * size)
        sampled_df = dataset.sample(n=n_rows, random_state=42, replace=True)
        size_tag = str(size).replace(".", "")
        filename = target_dir / f'reviews_{job}_dim_{size_tag}.csv'
        sampled_df.to_csv(filename, header=False, index=False, sep='\t', mode='w')


In [24]:
# Generate the Job1 input files, one per configured size factor.
create_datasets(dataset=reviews_job1, job='job1')

In [41]:
# Read back the Job1 file for size factor 10 (tab-separated, no header row)
# and preview it to check what was written.
job1_dim_10_path = './input/reviews_job1_dim_10.csv'
reviews_job1_10 = pd.read_csv(job1_dim_10_path, sep="\t", header=None)
reviews_job1_10.head()

Unnamed: 0,0,1,2
0,B003M63C0E,1338336000,"I have 5-7 dogs at any given time, sometimes f..."
1,B000CQIDHY,1297123200,I already liked regular Stash Earl Grey and so...
2,B001EQ55MM,1199577600,eight oclock makes great coffee and with balan...
3,B001PICX42,1309910400,I bought these for my kids but find myself eat...
4,B000B6MV9Q,1325203200,This is very good and just like the gourmet on...


### Per il Job2 abbiamo bisogno delle colonne **UserId**, **HelpfulnessDenominator** e **HelpfulnessNumerator**

In [44]:
# Keep only the columns Job2 consumes. Selecting them from the untouched
# `reviews` frame (instead of dropping the others in place) makes this cell
# safe to re-run: an in-place drop raises KeyError on a second execution
# because the dropped columns no longer exist.
job2_columns = ['UserId', 'HelpfulnessNumerator', 'HelpfulnessDenominator']
reviews_job2 = reviews[job2_columns].copy()
reviews_job2.head()

Unnamed: 0,UserId,HelpfulnessNumerator,HelpfulnessDenominator
0,A3SGXH7AUHU8GW,1,1
1,A1D87F6ZCVE5NK,0,0
2,ABXLMWJIXXAIN,1,1
3,A395BORC6FGVXV,3,3
4,A1UQRSCLF8GW1T,0,0


In [34]:
# Generate the Job2 input files, one per configured size factor.
create_datasets(dataset=reviews_job2, job='job2')

In [43]:
# Read back the Job2 file for size factor 0.5 (tab-separated, no header row)
# and display it; the truncated repr also shows the last row labels.
job2_dim_05_path = './input/reviews_job2_dim_05.csv'
reviews_job2_05 = pd.read_csv(job2_dim_05_path, sep="\t", header=None)
reviews_job2_05

Unnamed: 0,0,1,2
0,A27L3LYLHCQZYG,2,2
1,A1AES697PC2IW5,0,0
2,A1Q99N7YEJ6CZJ,0,0
3,A3RJVINZDBOUNE,0,0
4,A2LN6GJQI1S9EW,0,0
...,...,...,...
284222,A9JLE9BISQFUB,0,0
284223,AH7B7I1EQ0386,0,1
284224,A3QU9R1IZY03ZR,2,6
284225,A1JEY42M785KI7,60,62


### Per il Job3 abbiamo bisogno delle colonne **UserId** e **Score**

In [36]:
# Keep only the columns Job3 consumes. Selecting them from the untouched
# `reviews` frame (instead of dropping the others in place) makes this cell
# safe to re-run: an in-place drop raises KeyError on a second execution
# because the dropped columns no longer exist.
job3_columns = ['UserId', 'Score']
reviews_job3 = reviews[job3_columns].copy()
reviews_job3.head()

Unnamed: 0,UserId,Score
0,A3SGXH7AUHU8GW,5
1,A1D87F6ZCVE5NK,1
2,ABXLMWJIXXAIN,4
3,A395BORC6FGVXV,2
4,A1UQRSCLF8GW1T,5


In [37]:
# Generate the Job3 input files, one per configured size factor.
create_datasets(dataset=reviews_job3, job='job3')

In [38]:
# Read back the Job3 file for size factor 1 (tab-separated, no header row)
# and display it; the truncated repr also shows the last row labels.
job3_dim_1_path = './input/reviews_job3_dim_1.csv'
reviews_job3_1 = pd.read_csv(job3_dim_1_path, sep="\t", header=None)
reviews_job3_1

Unnamed: 0,0,1
0,A27L3LYLHCQZYG,5
1,A1AES697PC2IW5,5
2,A1Q99N7YEJ6CZJ,5
3,A3RJVINZDBOUNE,5
4,A2LN6GJQI1S9EW,4
...,...,...
568449,A3KJ9TZ2HLL7SA,5
568450,A54D9YSTLFQQE,5
568451,AWNRK5U5VSD0Q,5
568452,A3W1H0DC7KMRU6,4
