In [None]:
import numpy as np
import pandas as pd
import json
import gzip
from datetime import datetime
import time
import warnings
warnings.filterwarnings('ignore')
import torch
import re
import nltk
from transformers import pipeline
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sentence_transformers import SentenceTransformer as st
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [2]:
def json2df(path):
    data=[json.loads(i) for i in open(path, 'r')]
    df=pd.DataFrame.from_records(data)
    return(df)

In [3]:
Digital_Music = json2df('Digital_Music.json')
meta_Digital_Music = json2df('meta_Digital_Music.json')
Musical_Instruments = json2df('Musical_Instruments.json')
meta_Musical_Instruments = json2df('meta_Musical_Instruments.json')

In [4]:
Digital_Music['overall_split']=np.nan
Digital_Music['overall_split'].loc[Digital_Music['overall']>3.0]=0
Digital_Music['overall_split'].loc[Digital_Music['overall']<=3.0]=1

In [5]:
Digital_Music['overall_split'].value_counts()

0.0    1460076
1.0     124006
Name: overall_split, dtype: int64

In [6]:
Digital_Music1=Digital_Music.groupby('overall_split',group_keys=False).apply(lambda x: x.sample(120000))

In [7]:
Musical_Instruments['overall_split']=np.nan
Musical_Instruments['overall_split'].loc[Musical_Instruments['overall']>3.0]=0
Musical_Instruments['overall_split'].loc[Musical_Instruments['overall']<=3.0]=1

In [8]:
Musical_Instruments['overall_split'].value_counts()

0.0    1221392
1.0     291138
Name: overall_split, dtype: int64

In [9]:
Musical_Instruments1=Musical_Instruments.groupby('overall_split',group_keys=False).apply(lambda x: x.sample(120000))

In [10]:
Digital_Music1.to_csv('Digital_Music_sample.csv',index=False)
Musical_Instruments1.to_csv('Musical_Instruments_sample.csv',index=False)
meta_Digital_Music.to_csv('Digital_Music_meta.csv',index=False)
meta_Musical_Instruments.to_csv('Musical_Instruments_meta.csv',index=False)
Digital_Music.to_csv('Digital_Music_data.csv',index=False)
Musical_Instruments.to_csv('Musical_Instruments_data.csv',index=False)

In [114]:
music_data=pd.read_csv('Digital_Music_sample.csv')
instrument_data=pd.read_csv('Musical_Instruments_sample.csv')

## Data Exploration

### Music Data

In [115]:
music_data.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image,overall_split
0,5.0,True,"03 14, 2013",A3UC8L25PE4DRQ,B000W18IN8,{'Format:': ' MP3 Music'},Heather Mitchell,"This song just makes me happy, I love the beat...",Happy,1363219200,,,0.0
1,4.0,True,"07 14, 2014",A4C8P8UILDU7Z,B00136JA8S,{'Format:': ' MP3 Music'},Crazy Cat Lady,Love this song. It's catchy and you can dance ...,Four Stars,1405296000,,,0.0
2,5.0,True,"06 23, 2013",AHK0R8V0ZL4KL,B003A4QRU0,{'Format:': ' MP3 Music'},Ellen M. Viars,i was so happy to learn that i could download ...,neverr my love,1371945600,,,0.0
3,5.0,False,"04 2, 2005",A1EH6PWJVKD9VH,B000QZX5B0,{'Format:': ' Audio CD'},Jon Hartley,This album in my opinion kicks just as much as...,Awsome album,1112400000,,,0.0
4,5.0,True,"01 8, 2015",A3EWYJXU7V4VA0,B0048IMZTE,{'Format:': ' MP3 Music'},Kasey M. Webb,sweet song :),Five Stars,1420675200,,,0.0


In [116]:
music_data.shape

(240000, 13)

In [117]:
music_data.dtypes

overall           float64
verified             bool
reviewTime         object
reviewerID         object
asin               object
style              object
reviewerName       object
reviewText         object
summary            object
unixReviewTime      int64
vote              float64
image              object
overall_split     float64
dtype: object

In [118]:
def unix2time(df,columnname):
    s=[]
    for i in np.array(df[columnname]):
        ts = int(i)
        k=datetime.utcfromtimestamp(ts).strftime('%Y-%m-%d')
        s.append(k)
    return(s)

In [119]:
music_data['date']=unix2time(music_data,'unixReviewTime')

In [120]:
music_data['date']=pd.to_datetime(music_data['date'])

In [121]:
music_data['overall']=music_data['overall'].astype('int')

In [122]:
music_data.isnull().sum()

overall                0
verified               0
reviewTime             0
reviewerID             0
asin                   0
style              43321
reviewerName          16
reviewText           153
summary               74
unixReviewTime         0
vote              211417
image             238923
overall_split          0
date                   0
dtype: int64

In [123]:
music_data.drop(['vote','reviewTime','image','style','reviewerName'],axis=1,inplace=True)
music_data.dropna(inplace=True)

In [124]:
music_data.isnull().sum()

overall           0
verified          0
reviewerID        0
asin              0
reviewText        0
summary           0
unixReviewTime    0
overall_split     0
date              0
dtype: int64

In [125]:
music_data.shape

(239783, 9)

In [126]:
music_data.describe()

Unnamed: 0,overall,unixReviewTime,overall_split
count,239783.0,239783.0,239783.0
mean,3.495031,1402848000.0,0.500152
std,1.535608,85659760.0,0.500001
min,1.0,880675200.0,0.0
25%,2.0,1369181000.0,0.0
50%,3.0,1418170000.0,1.0
75%,5.0,1456963000.0,1.0
max,5.0,1538438000.0,1.0


In [127]:
music_data.dtypes

overall                    int32
verified                    bool
reviewerID                object
asin                      object
reviewText                object
summary                   object
unixReviewTime             int64
overall_split            float64
date              datetime64[ns]
dtype: object

In [128]:
music_data['text']=music_data['reviewText'].astype(str)+' ' + music_data['summary'].astype(str)

In [129]:
music_data.drop(columns=['reviewText','summary'],inplace=True)

In [130]:
def clean_text(text):
    # Remove all non-letters and non-spaces except for hyphens and digits
    text = re.sub("[^0-9A-Za-z\- ]+", " ", text)
    # Remove all numbers except those attached to a word
    text = re.sub("(?<!\w)\d+", "", text)
    # Remove all hyphens except between two words
    text = re.sub("-(?!\w)|(?<!\w)-", "", text)
    # Remove multiple spaces and lowercase everything
    text = " ".join(text.split())
    text = text.lower()
    return text

In [131]:
music_data['text'] = music_data['text'].apply(clean_text)

In [132]:
stop_words = stopwords.words('english')
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [133]:
stop = ['i','me','my','myself','we','our','ours','ourselves','you',"you're","you've","you'll","you'd",'your','yours','yourself','yourselves','he',
'him','his','himself','she',"she's",'her','hers','herself','it',"it's",'its','itself','they','them','their','theirs','themselves','what','which','who','whom',
'this','that',"that'll",'these','those','am','is','are','was','were','be','been','being','have','has','had','having','do','does','did',
'doing','a','an','the','and','but','if','or','because','as','until','while','of','at','by','for','with','about','against','between','into',
'through','during','before','after','above','below','to','from','up','down','in','out','on','off','over','under','again','further','then','once',
'here','there','when','where','why','how','all','any','both','each','few','more','most','other','some','such','no','nor','only','own',
'same','so','than','too','very','s','t','can','will','just','don','should',"should've",'now','d','ll','m','o','re','ve','y','ain',
'aren',"aren't",'couldn',"couldn't",'didn',"didn't",'doesn',"doesn't",'hadn',"hadn't",'hasn',"hasn't",'haven',"haven't",'isn',"isn't",
'ma','mightn',"mightn't",'mustn',"mustn't",'needn',"needn't",'shan',"shan't",'shouldn','wasn',"wasn't",'weren',"weren't",'won',"won't",
'wouldn',"wouldn't"]

In [134]:
lemmatizer = WordNetLemmatizer()

music_data['text_token'] = music_data['text'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split() if word not in stop]))

In [135]:
music_data['text_token']=music_data['text_token'].apply(word_tokenize)

In [136]:
music_data.head()

Unnamed: 0,overall,verified,reviewerID,asin,unixReviewTime,overall_split,date,text,text_token
0,5,True,A3UC8L25PE4DRQ,B000W18IN8,1363219200,0.0,2013-03-14,this song just makes me happy i love the beat ...,"[song, make, happy, love, beat, lyric, bad, mo..."
1,4,True,A4C8P8UILDU7Z,B00136JA8S,1405296000,0.0,2014-07-14,love this song it s catchy and you can dance t...,"[love, song, catchy, dance, four, star]"
2,5,True,AHK0R8V0ZL4KL,B003A4QRU0,1371945600,0.0,2013-06-23,i was so happy to learn that i could download ...,"[happy, learn, could, download, music, song, l..."
3,5,False,A1EH6PWJVKD9VH,B000QZX5B0,1112400000,0.0,2005-04-02,this album in my opinion kicks just as much as...,"[album, opinion, kick, much, as, every, one, s..."
4,5,True,A3EWYJXU7V4VA0,B0048IMZTE,1420675200,0.0,2015-01-08,sweet song five stars,"[sweet, song, five, star]"


In [156]:
c=b-a
c

2355.676726102829

In [41]:
music_vectorized=pd.read_csv('music_vectorized.csv')
music_vectorized.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,0.006341,-0.048968,0.050782,0.00477,0.014288,-0.054868,-0.042598,-0.005151,-0.00699,0.062035,...,-0.004214,0.015523,0.008789,0.04498,-0.013625,-0.009514,0.09959,-0.044852,-0.035846,0.017131
1,-0.016574,-0.014582,0.004706,-0.057177,0.02748,-0.067841,-0.070344,0.090828,0.025527,-0.043992,...,-0.005843,0.024512,0.043574,0.020036,-0.071452,0.008643,0.052097,-0.027362,-0.052158,-0.024019
2,-0.002461,-0.013773,0.038159,-0.048271,0.024668,0.006964,-0.033701,0.009722,0.016837,0.047756,...,-0.031977,0.014965,-0.01151,0.02367,-0.033534,0.071706,0.086084,-0.030905,-0.061779,0.029862
3,0.019229,0.058249,-0.015583,-0.021432,0.017944,-0.017404,0.026222,0.032445,-0.006221,-0.026489,...,-0.011361,0.005258,-0.0002,-0.007923,-0.021057,0.065149,0.088129,-0.031753,-0.027974,0.036131
4,0.04671,0.056445,0.059222,0.019981,-0.006979,0.000911,0.029603,-0.046746,-0.007103,-0.019995,...,0.007344,0.010331,-0.018815,0.048318,-0.141372,0.041775,-0.005285,-0.059265,0.020586,-0.043401


### Instrument Data

In [137]:
instrument_data.head()

Unnamed: 0,overall,vote,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,image,overall_split
0,5.0,,True,"01 23, 2017",AO3S8ECI9UKNR,B000EELFTW,,M &amp;amp; J P,Works well/good quality,Recommend,1485129600,,0.0
1,5.0,10.0,True,"03 21, 2017",A19OPCXN2BSMNB,B003YOT65E,"{'Color:': ' Black', 'Style:': ' Guitar Only'}",Jenn,I don't expect much from a $35 guitar. I wante...,Best value at this price point,1490054400,,0.0
2,5.0,,True,"02 21, 2014",A19OG7CX70QMMX,B004Z8ORXO,{'Color:': ' Bright Red'},Jody Cranford,"This is like a grown up drum kit, that's gone ...",High quality drum kit for anyone!! Well small...,1392940800,,0.0
3,5.0,,True,"11 30, 2014",AZILBE3JAU7ND,B004Z17008,{'Size:': ' SN6'},BDFU,Best tuner on the market. I own several of the...,Best tuner on the market,1417305600,,0.0
4,5.0,,True,"01 4, 2015",A3BRP3OO1911O2,B000189YD0,{'Color:': ' Liquid Black'},Laura,This was for my 13 year old son as a beginner ...,Excellent choice for a beginner bass player,1420329600,,0.0


In [138]:
instrument_data.dtypes

overall           float64
vote               object
verified             bool
reviewTime         object
reviewerID         object
asin               object
style              object
reviewerName       object
reviewText         object
summary            object
unixReviewTime      int64
image              object
overall_split     float64
dtype: object

In [139]:
instrument_data['date']=unix2time(instrument_data,'unixReviewTime')

In [140]:
instrument_data['date']=pd.to_datetime(instrument_data['date'])

In [141]:
instrument_data['overall']=instrument_data['overall'].astype('int')

In [142]:
instrument_data.shape

(240000, 14)

In [143]:
instrument_data.isnull().sum()

overall                0
vote              196251
verified               0
reviewTime             0
reviewerID             0
asin                   0
style             136305
reviewerName          37
reviewText            86
summary               52
unixReviewTime         0
image             234578
overall_split          0
date                   0
dtype: int64

In [144]:
instrument_data.drop(['vote','image','reviewTime','style','reviewerName'],axis=1,inplace=True)
instrument_data.dropna(inplace=True)

In [145]:
instrument_data.isnull().sum()

overall           0
verified          0
reviewerID        0
asin              0
reviewText        0
summary           0
unixReviewTime    0
overall_split     0
date              0
dtype: int64

In [146]:
instrument_data.shape

(239865, 9)

In [147]:
instrument_data['text']=instrument_data['reviewText'].astype(str)+' ' + instrument_data['summary'].astype(str)

In [148]:
instrument_data.drop(columns=['reviewText','summary'],inplace=True)

In [149]:
instrument_data['text'] = instrument_data['text'].apply(clean_text)

In [150]:
instrument_data['text_token'] = instrument_data['text'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split() if word not in stop]))

In [151]:
instrument_data['text_token']=instrument_data['text_token'].apply(word_tokenize)

In [152]:
instrument_data.head()

Unnamed: 0,overall,verified,reviewerID,asin,unixReviewTime,overall_split,date,text,text_token
0,5,True,AO3S8ECI9UKNR,B000EELFTW,1485129600,0.0,2017-01-23,works well good quality recommend,"[work, well, good, quality, recommend]"
1,5,True,A19OPCXN2BSMNB,B003YOT65E,1490054400,0.0,2017-03-21,i don t expect much from a guitar i wanted a c...,"[expect, much, guitar, wanted, cheap, knock-ar..."
2,5,True,A19OG7CX70QMMX,B004Z8ORXO,1392940800,0.0,2014-02-21,this is like a grown up drum kit that s gone t...,"[like, grown, drum, kit, gone, thru, wonkavisi..."
3,5,True,AZILBE3JAU7ND,B004Z17008,1417305600,0.0,2014-11-30,best tuner on the market i own several of thes...,"[best, tuner, market, several, go, several, uk..."
4,5,True,A3BRP3OO1911O2,B000189YD0,1420329600,0.0,2015-01-04,this was for my year old son as a beginner gui...,"[year, old, son, beginner, guitar, gotten, adv..."


In [63]:
pd.read_csv('instrument_vectorized.csv')

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,0.034814,0.064835,-0.033559,-0.030410,0.020887,-0.010554,0.003480,0.057081,0.096977,-0.022064,...,0.056040,-0.004366,0.013781,0.041679,0.040939,-0.053549,0.035697,-0.070312,0.016141,0.002079
1,0.031321,-0.031812,0.014827,-0.005265,-0.028531,0.066086,0.000113,0.060869,0.035434,0.008755,...,0.011690,0.013695,-0.044613,0.001436,-0.059198,-0.003137,-0.003530,-0.038577,-0.048716,-0.017238
2,0.013715,0.068224,-0.021230,0.033625,-0.003060,0.009364,0.033159,-0.030048,0.036734,-0.012419,...,0.000962,-0.030951,-0.001161,0.089217,0.038845,-0.030696,-0.030912,-0.051987,-0.009133,0.005690
3,0.004411,0.038235,-0.027332,-0.002045,0.003019,-0.021669,-0.000427,0.009215,0.035430,-0.028581,...,0.001876,0.053622,-0.051027,-0.024830,-0.004884,-0.025674,0.073094,0.002586,0.010627,-0.061680
4,0.038393,0.013537,-0.002695,-0.085531,0.006316,0.006316,-0.005079,0.004417,0.065546,-0.010181,...,-0.051903,0.041929,-0.064161,0.033739,-0.014316,0.042210,0.074691,-0.036988,0.013815,-0.021834
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39995,-0.003175,0.008541,0.012754,-0.078046,0.034444,-0.046241,-0.001078,0.013823,0.041080,0.004543,...,0.002137,0.043647,0.019105,0.068905,0.064226,-0.060153,-0.005863,-0.001481,0.013904,-0.005184
39996,-0.000918,0.010933,-0.006332,-0.031070,-0.029590,-0.013627,-0.009750,-0.005595,0.047023,0.010987,...,0.039936,0.022022,-0.005173,0.025662,-0.012191,-0.023061,-0.077481,-0.034152,-0.033166,-0.022671
39997,-0.002520,0.035724,-0.009397,-0.003433,0.024485,0.045265,-0.017147,0.014443,0.064398,-0.006683,...,0.100031,0.024986,-0.047099,0.088739,0.026823,-0.034279,0.045044,-0.057575,0.030394,0.005528
39998,-0.021202,-0.007869,-0.021757,-0.035217,0.063170,-0.053930,-0.003447,-0.005448,0.042984,-0.022741,...,0.003773,0.002308,-0.068476,0.026154,0.032643,-0.007690,0.050376,-0.034006,0.053289,-0.041376


## Sentiment Analysis

In [64]:
def vader_sentiment_analyzer(df):
    d=time.time()
    sentiments = SentimentIntensityAnalyzer()
    df["positive"] = [sentiments.polarity_scores(i)["pos"] for i in df["text"]]
    df["negative"] = [sentiments.polarity_scores(i)["neg"] for i in df["text"]]
    df["neutral"] = [sentiments.polarity_scores(i)["neu"] for i in df["text"]]
    df["compound"] = [sentiments.polarity_scores(i)["compound"] for i in df["text"]]
    df['vader_sentiment']=range(len(df))
    df['vader_sentiment'].loc[df['compound'] >=0.5]='Positive'
    df['vader_sentiment'].loc[(df['compound'] >= 0) & (df['compound'] < 0.5)]='Neutral'
    df['vader_sentiment'].loc[(df['compound'] < 0) & (df['compound'] > -0.5)]='Conflict'
    df['vader_sentiment'].loc[df['compound'] <=-0.5]='Negative'
    e=time.time()
    f=d-e
    return(df,f)

In [65]:
music_data,vader1time=vader_sentiment_analyzer(music_data)

In [66]:
music_data.head()

Unnamed: 0,overall,asin,unixReviewTime,overall_split,date,text,text_token,positive,negative,neutral,compound,vader_sentiment
0,5,B000W18IN8,1363219200,0.0,2013-03-14,this song just makes me happy i love the beat ...,"[song, make, happy, love, beat, lyric, bad, mo...",0.338,0.056,0.606,0.9565,Positive
1,4,B00136JA8S,1405296000,0.0,2014-07-14,love this song it s catchy and you can dance t...,"[love, song, catchy, dance, four, star]",0.259,0.0,0.741,0.6369,Positive
2,5,B003A4QRU0,1371945600,0.0,2013-06-23,i was so happy to learn that i could download ...,"[happy, learn, could, download, music, song, l...",0.497,0.0,0.503,0.9501,Positive
3,5,B000QZX5B0,1112400000,0.0,2005-04-02,this album in my opinion kicks just as much as...,"[album, opinion, kick, much, as, every, one, s...",0.201,0.111,0.688,0.6582,Positive
4,5,B0048IMZTE,1420675200,0.0,2015-01-08,sweet song five stars,"[sweet, song, five, star]",0.5,0.0,0.5,0.4588,Neutral


In [67]:
music_data.dtypes

overall                     int32
asin                       object
unixReviewTime              int64
overall_split             float64
date               datetime64[ns]
text                       object
text_token                 object
positive                  float64
negative                  float64
neutral                   float64
compound                  float64
vader_sentiment            object
dtype: object

In [68]:
vader1time

-449.3929753303528

In [69]:
instrument_data,vader2time=vader_sentiment_analyzer(instrument_data)

In [70]:
instrument_data

Unnamed: 0,overall,asin,unixReviewTime,overall_split,date,text,text_token,positive,negative,neutral,compound,vader_sentiment
0,5,B000EELFTW,1485129600,0.0,2017-01-23,works well good quality recommend,"[work, well, good, quality, recommend]",0.789,0.000,0.211,0.7579,Positive
1,5,B003YOT65E,1490054400,0.0,2017-03-21,i don t expect much from a guitar i wanted a c...,"[expect, much, guitar, wanted, cheap, knock-ar...",0.250,0.045,0.704,0.9929,Positive
2,5,B004Z8ORXO,1392940800,0.0,2014-02-21,this is like a grown up drum kit that s gone t...,"[like, grown, drum, kit, gone, thru, wonkavisi...",0.248,0.000,0.752,0.9042,Positive
3,5,B004Z17008,1417305600,0.0,2014-11-30,best tuner on the market i own several of thes...,"[best, tuner, market, several, go, several, uk...",0.331,0.000,0.669,0.8555,Positive
4,5,B000189YD0,1420329600,0.0,2015-01-04,this was for my year old son as a beginner gui...,"[year, old, son, beginner, guitar, gotten, adv...",0.135,0.036,0.830,0.9427,Positive
...,...,...,...,...,...,...,...,...,...,...,...,...
239995,2,B00CBPAPZW,1473638400,1.0,2016-09-12,unstable flips over easily lens doesn t rotate...,"[unstable, flip, easily, lens, rotate, look, l...",0.255,0.193,0.552,0.1181,Neutral
239996,1,B00GEV6P8U,1451606400,1.0,2016-01-01,very very very disappointed with this product ...,"[disappointed, product, first, two, purchased,...",0.074,0.224,0.702,-0.9851,Negative
239997,2,B0002OSBDM,1365984000,1.0,2013-04-15,i purchased this because i thought it would ma...,"[purchased, thought, would, make, transition, ...",0.096,0.015,0.889,0.8990,Positive
239998,2,B005HJAH2A,1435363200,1.0,2015-06-27,i have now had three of these exact same carts...,"[three, exact, cart, first, one, lasted, moder...",0.070,0.056,0.874,0.4366,Neutral


In [71]:
vader2time

-497.9858446121216

In [157]:
m1,vader3time=vader_sentiment_analyzer(m1)
i1,vader4time=vader_sentiment_analyzer(i1)

In [92]:
sentiments = SentimentIntensityAnalyzer()
sent='''this guitar is good for a year old it s a beautiful guitar but a 
small size guitar the scale length of an average guitar should be around and this only has a scale length that 
makes a difference if you are looking to fit a year old other than the scale length i would say this is a good guitar 
so if you are looking for a young child and want a step up from a toy then this would be your choice small size guitar'''
sentiments.polarity_scores(sent)["compound"] 

0.899

On the above review the person is not fully satisfied with the product. He has mentioned that the guitar size is small and it won't suite adults but sentiment intensity analyzer has given a positive sentiment i.e score is greater than 0.5. Thereby we can implement this on bert.

In [74]:
torch.device('cuda' if torch.cuda.is_available() else 'cpu')

device(type='cpu')

In [75]:
def bert_sentiment_analyzer(text):
    d=time.time()
    analyzer = pipeline('sentiment-analysis', model="nlptown/bert-base-multilingual-uncased-sentiment")
    bert=analyzer(text,truncation=True)
    lst=[]
    for i in bert:
        lst.append(i['label'])
    e=time.time()
    f=e-d
    return(lst,f)

In [93]:
sentiment,bert_time=bert_sentiment_analyzer('''this guitar is good for a year old it s a beautiful guitar but a 
small size guitar the scale length of an average guitar should be around and this only has a scale length that 
makes a difference if you are looking to fit a year old other than the scale length i would say this is a good guitar 
so if you are looking for a young child and want a step up from a toy then this would be your choice small size guitar''')

In [94]:
sentiment

['3 stars']

In [95]:
bert_time

3.9807558059692383

In bert it has given 3 stars which means this review is not positive but it's an average. We can see the difference between the sentiment intensity analyzer and bert through this comparison but due to large computation time it is hard to implement bert without GPU ot TPU. So we are considering the sentiment's given by sentiment intensity analyzer.

In [159]:
music_data.to_csv('Music_sentiment_data.csv',index=False)
instrument_data.to_csv('Instrument_sentiment_data.csv',index=False)

In [158]:
m1.to_csv('Music_ml_data.csv',index=False)
i1.to_csv('Instrument_ml_data.csv',index=False)

In [106]:
tokenizer=AutoTokenizer.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")

In [107]:
model=AutoModelForSequenceClassification.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")

In [110]:
sd=time.time()
tokens=tokenizer.encode('''this guitar is good for a year old it s a beautiful guitar but a 
small size guitar the scale length of an average guitar should be around and this only has a scale length that 
makes a difference if you are looking to fit a year old other than the scale length i would say this is a good guitar 
so if you are looking for a young child and want a step up from a toy then this would be your choice small size guitar''',return_tensors='pt')
result=model(tokens)

sw=time.time()

In [111]:
sw-sd

0.41034603118896484

In [None]:
print("This sentence gets", int(torch.argmax(result.logits))+1, "stars !")
