In [0]:
# Mount Google Drive so the CSV files stored under "My Drive" can be read from this runtime.
from google.colab import drive 
# Interactive step: asks for an authorization code, then mounts the drive at /content/gdrive.
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
# %pip targets the running kernel's environment; the version is pinned to the
# release this notebook was originally run with so re-runs stay reproducible.
%pip install vaderSentiment==3.2.1

Collecting vaderSentiment
[?25l  Downloading https://files.pythonhosted.org/packages/86/9e/c53e1fc61aac5ee490a6ac5e21b1ac04e55a7c2aba647bb8411c9aadf24e/vaderSentiment-3.2.1-py2.py3-none-any.whl (125kB)
[K     |████████████████████████████████| 133kB 3.5MB/s 
[?25hInstalling collected packages: vaderSentiment
Successfully installed vaderSentiment-3.2.1


In [0]:
# Numeric / tabular stack
import numpy as np
import pandas as pd

# NLP: NLTK stop-word corpus and the VADER sentiment analyzer
import nltk
from nltk.corpus import stopwords
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Model
from sklearn.tree import DecisionTreeClassifier

# Colab helper used to download generated CSV files
from google.colab import files

# The corpus only has to be present before stopwords.words() is first called,
# so fetching it after the import works (same order as the original cell).
nltk.download('stopwords')


# Folder on the mounted Drive that holds the CSV files. An absolute path that
# matches the /content/gdrive mount point keeps these reads independent of the
# current working directory.
DATA_DIR = '/content/gdrive/My Drive'

test = pd.read_csv(DATA_DIR + '/yelp_test.csv')
train = pd.read_csv(DATA_DIR + '/yelp_train.csv')
sample = pd.read_csv(DATA_DIR + '/yelp_sample.csv')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# All Data

In [0]:
# Stop words are very common words (is, the, as, and, for, ...) that appear in most sentences.
# Stored as a set because remove_stops tests membership once per word.
stop_words = set(stopwords.words('english'))


# One shared VADER analyzer instance; sentiment_analyzer_scores calls it for every text.
analyser = SentimentIntensityAnalyzer()

# turns user_elite into a number of years
def elite(i):
    """Return the number of elite years encoded in a ``user_elite`` value.

    Parameters
    ----------
    i : str, float or None
        Comma-separated list of years (e.g. ``'2013, 2012'``), the literal
        string ``'None'``, or a missing value (``None`` / NaN).

    Returns
    -------
    int
        Number of comma-separated entries; 0 when there are none.
    """
    # Missing cells arrive as None or float NaN (NaN is the only value that is
    # not equal to itself); the old code crashed on them with AttributeError.
    if i is None or (isinstance(i, float) and i != i):
        return 0

    # str() also tolerates a lone year that was parsed as a number.
    text = str(i).strip()
    if not text or text == 'None':
        return 0

    # n commas separate n + 1 years
    return text.count(',') + 1

# turns the text into list of all words in text
def punc_split(i):
  """Lower-case ``i``, delete punctuation characters and split on whitespace.

  Parameters
  ----------
  i : str
      Text to tokenize.

  Returns
  -------
  list of str
      The remaining whitespace-separated words (empty list if none remain).
  """
  # Raw string: the previous literal contained "\," which is an invalid escape
  # sequence (a warning today, an error in future Python versions). The set of
  # removed characters is unchanged and still includes the backslash.
  punctuations = r'''!()-[]{};:'"\,<>./?@#$%^&*_~'''
  # A translation table deletes every listed character in one pass instead of
  # rebuilding the string one character at a time.
  strip_table = str.maketrans('', '', punctuations)
  return i.lower().translate(strip_table).split()

# calculates the average word length in the text
def avg_word_len(i):
  """Return the mean length of the words that ``punc_split`` extracts from ``i``.

  Returns 0.0 when the text contains no words.
  """
  lengths = [len(w) for w in punc_split(i)]
  # np.mean([]) returns NaN and emits a RuntimeWarning; a text without any
  # words gets an explicit 0.0 instead.
  if not lengths:
    return 0.0
  return np.mean(lengths)

#parses a date string and returns its year as an int
def dateyear(i):
  """Parse ``i`` with ``pd.to_datetime`` and return the ``year`` of the result."""
  parsed = pd.to_datetime(i)
  return parsed.year

#returns list of nonstop words
def remove_stops(i):
  """Return the words of ``i`` (as produced by ``punc_split``) that are not in ``stop_words``.

  Word order and duplicates are kept.
  """
  return [token for token in punc_split(i) if token not in stop_words]

#finds average length of nonstop words
def nonstop_word_len(i):
  """Return the mean length of the non-stop words in ``i`` (see ``remove_stops``).

  Returns 0.0 when no non-stop words remain.
  """
  lengths = [len(w) for w in remove_stops(i)]
  # np.mean([]) returns NaN and emits a RuntimeWarning; use an explicit 0.0
  # for texts made up only of stop words or punctuation.
  if not lengths:
    return 0.0
  return np.mean(lengths)

#sentiment analyzer score
def sentiment_analyzer_scores(sentence):
    """Return the result of ``analyser.polarity_scores`` for ``sentence``.

    ``process5sa`` reads the ``'compound'``, ``'pos'`` and ``'neg'`` entries of
    the returned mapping.
    """
    return analyser.polarity_scores(sentence)

In [0]:
def process5sa(df, current_year=2019):
    """Build the numeric feature table that is fed to the decision tree.

    Parameters
    ----------
    df : pandas.DataFrame
        Review data. The columns read are ``user_average_stars``,
        ``business_average_stars``, ``cool``, ``useful``, ``funny``,
        ``user_elite``, ``user_yelping_since`` and ``text``.
    current_year : int, optional
        Reference year for the "Proportion elite" feature. Defaults to 2019,
        the value that used to be hard-coded.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``df`` (with a fresh 0..n-1 index) and 16 feature
        columns: user/business stars, log vote counts, elite and tenure
        features, text-length features and VADER sentiment scores.

    Notes
    -----
    Ratios whose denominator is not positive (no words in the text, or no
    full year on Yelp yet) are set to 0.0 instead of producing NaN/inf, and a
    text without words gets an average word length of 0.0.
    """
    n_rows = len(df)

    #average user and business stars
    avg_stars = np.asarray(df['user_average_stars'])
    b_avg_stars = np.asarray(df['business_average_stars'])

    #log of (1 + votes) for cool, useful, funny; the +1 keeps zero votes finite
    log_cool = np.asarray(np.log(df['cool'] + 1))
    log_useful = np.asarray(np.log(df['useful'] + 1))
    log_funny = np.asarray(np.log(df['funny'] + 1))

    #applies elite to user_elite
    years_elite = np.asarray(df['user_elite'].apply(elite))

    #turns yelping_since into a year (parsed for the whole column at once)
    year_since = np.asarray(pd.to_datetime(df['user_yelping_since']).dt.year)

    #proportion of elite years out of the years since joining;
    #rows with a non-positive tenure keep the 0.0 from `out`
    years_on_yelp = current_year - year_since
    proportion_elite = np.divide(years_elite, years_on_yelp,
                                 out=np.zeros(n_rows, dtype=float),
                                 where=years_on_yelp > 0)

    #length of each text in characters
    text_len = np.asarray([len(i) for i in df['text']])

    #each text as a list of words; kept as a plain list of lists because the
    #rows have different lengths and recent NumPy refuses to build an array
    #from ragged sequences
    word_lists = [punc_split(i) for i in df['text']]

    #number of words in a text
    num_words = np.asarray([len(words) for words in word_lists])

    #number of unique words in a text
    num_uwords = np.asarray([len(set(words)) for words in word_lists])

    #proportion of unique words in a text (0.0 for texts without words)
    proportion_unique = np.divide(num_uwords, num_words,
                                  out=np.zeros(n_rows, dtype=float),
                                  where=num_words > 0)

    #average word length in a text (0.0 for texts without words)
    word_len = np.asarray([np.mean([len(w) for w in words]) if words else 0.0
                           for words in word_lists])

    #sentiment analysis
    scores = [sentiment_analyzer_scores(i) for i in df['text']]
    compound_scores = [s['compound'] for s in scores]
    pos_scores = [s['pos'] for s in scores]
    neg_scores = [s['neg'] for s in scores]

    #dictionary of data; key order defines the column order of the result
    data = {'User stars': avg_stars,
            'Business avg stars' : b_avg_stars,
            'Cool' : log_cool,
            'Funny' : log_funny,
            'Useful' : log_useful,
            'Years elite' : years_elite,
            'Yelping since' : year_since,
            'Proportion elite' : proportion_elite,
            'Text length' : text_len,
            'Number of words' : num_words,
            'Unique words' : num_uwords,
            'Proportion of unique words' : proportion_unique,
            'Average word length' : word_len,
            'Compound senti' : compound_scores,
            'Pos senti': pos_scores,
            'Neg senti': neg_scores
           }

    #returns dataframe from the data above
    return pd.DataFrame(data)

In [0]:
def dtc(train, test, func):
  """Fit a decision tree on ``func(train)`` and predict labels for ``func(test)``.

  Parameters
  ----------
  train : DataFrame with an ``is_good_rating`` column used as the target.
  test : DataFrame to predict for.
  func : callable that turns either frame into the feature table.

  Prints the score of the fitted tree on the training data and returns the
  array of predictions for ``test``.
  """
  # Same feature processing for both splits
  features_train = func(train)
  features_test = func(test)
  labels = train['is_good_rating']

  # Depth-limited tree with a fixed seed
  classifier = DecisionTreeClassifier(max_depth=12, random_state=66)
  classifier.fit(features_train, labels)

  # Score on the very data the tree was fitted on (not a held-out estimate)
  print(classifier.score(features_train, labels))

  return classifier.predict(features_test)

In [0]:
#turn predictions into downloaded file:
def submission(array, name):
  """Write ``array`` as the ``is_good_rating`` column of a CSV and download it.

  Uses the notebook-level ``sample`` frame as the template (it is not
  modified) and Colab's ``files`` helper for the download. The file is
  written to ``name + '.csv'`` with ``review_id`` as the index column.
  """
  # assign() works on a copy, so `sample` itself stays untouched
  result = sample.assign(is_good_rating=array).set_index('review_id')
  out_file = name + '.csv'
  result.to_csv(out_file)
  files.download(out_file)

In [0]:
# Fit on the training frame with the process5sa features and predict labels for the test frame.
pred_final = dtc(train, test, process5sa)

In [0]:
# Write the test predictions to final.csv and download it through Colab.
submission(pred_final, 'final')

In [0]:
# Same tree configuration as inside dtc(), refitted at top level so that the
# fitted model and its training-set predictions remain available as globals.
dtc66 = DecisionTreeClassifier(random_state=66, max_depth=12)

# processed training features (the test set is not used in this cell)
pro_train = process5sa(train)

# fitting the DTC
dtc66.fit(pro_train, train['is_good_rating'])

# training score (measured on the data the tree was fitted on)
print(dtc66.score(pro_train, train['is_good_rating']))

# predictions for the training rows themselves
trainp = dtc66.predict(pro_train)

0.8719208333333334


In [0]:
# NOTE: this repeats the definition of submission() from an earlier cell; the
# later definition is the one in effect once this cell has run.
def submission(array, name):
  """Write ``array`` as the ``is_good_rating`` column of a CSV and download it.

  Uses the notebook-level ``sample`` frame as the template (it is not
  modified) and Colab's ``files`` helper for the download. The file is
  written to ``name + '.csv'`` with ``review_id`` as the index column.
  """
  # assign() works on a copy, so `sample` itself stays untouched
  result = sample.assign(is_good_rating=array).set_index('review_id')
  out_file = name + '.csv'
  result.to_csv(out_file)
  files.download(out_file)

In [0]:
# Peek at the training-set predictions (NumPy abbreviates long arrays when displaying them).
trainp

array([0, 1, 1, ..., 1, 1, 0])

In [0]:
# Attach the training-set predictions to the training frame (adds or overwrites 'pred').
train['pred'] = trainp
# Export just that column, together with the frame's index, and download the file.
train[['pred']].to_csv('train_pred_milan.csv')
files.download('train_pred_milan.csv')

In [0]:
# Show only the first rows; the frame is far too large to render in full.
train.head(10)

Unnamed: 0,review_id,business_id,user_id,date,cool,funny,useful,text,user_average_stars,user_elite,user_review_count,user_yelping_since,business_categories,business_city,business_latitude,business_longitude,business_review_count,business_average_stars,business_state,is_good_rating,pred
0,9uu-JXdQuQ3rXOd1Vxv8kA,8zN3nV0zbtE377_XhlvabA,MWahC54Rd_CNv_0VtjCD6A,2017-06-27,0,0,0,"Pool tables, darts, and other neighborhood bar...",2.00,,5,2013-10-23,"Restaurants, Sports Bars, Bars, American (Trad...",Chandler,33.336050,-111.844343,158,4.0,AZ,0,0
1,UbCgugoTK2MfBmtU_8PXaQ,wJ-PcjemwR44WitIJsDzsA,N-GOto4b3ltfGJ7y-5g-pg,2017-03-25,0,0,0,"This place was alright. However, their classic...",4.43,,7,2016-07-04,"Beauty & Spas, Nail Salons",Fort Mill,35.058040,-80.937064,26,3.5,SC,0,1
2,6oF9_NR1O5Y_NOE6INXfrg,fP4sMlVw_MvyGNv4xeTGDA,RH3w77UyXaQPiOx2jOE7zg,2017-07-14,0,0,0,First time at Pieology Pizzeria and I was impr...,4.09,,21,2013-09-05,"Pizza, Restaurants",Las Vegas,36.114884,-115.155604,189,4.0,NV,1,1
3,kyYWrcrr83GnJ7m_VEvwCA,hgWMxKhrnOUd3m5nOUBIkA,g6pwf1E-CylnZQRoaZGGdw,2010-12-13,1,0,0,Its a good facility with lots of equipment and...,3.55,"2013, 2012",83,2010-09-03,"Active Life, Fitness & Instruction, Trainers, ...",Las Vegas,36.025332,-115.120852,316,3.0,NV,0,1
4,K2X3XG9_M3EM0TWguo2bCw,KMKPusWbBaIORB669W0EeQ,14nl002my6qLaEcqYHLPig,2016-09-12,1,1,1,Very very disappointed! My shawarma chicken wa...,3.75,,28,2016-03-24,"Mediterranean, Restaurants",Henderson,36.005901,-115.111863,61,3.5,NV,0,0
5,8BTtH9kpDTbU17j0qWX-Tw,JJcp5T5WQW3Z7zHPvsM9Yg,LK5fn6avSIfQneJgGXcKwQ,2017-09-13,0,0,0,"This is a first-class practice. First, the sta...",4.86,,5,2015-07-19,"Optometrists, Health & Medical",Chandler,33.291180,-111.803486,35,4.5,AZ,1,1
6,KTqCUw0I4esm4BHAPjBk7Q,qjnJFZtsY_nfRzoL3J_UWQ,YLgDEG9vFIiHo5e3K_g0hQ,2011-04-06,0,0,0,Rooms are too close to the roller coaster. If ...,3.40,,121,2011-03-22,"Hotels, Resorts, Casinos, Event Planning & Ser...",Las Vegas,36.101118,-115.173586,1831,3.5,NV,0,0
7,knaVMFxnKn86p1dW2QRfQg,769NudnrUxWFtJCGU66A_A,M_oqIYj88IOcGhaEcnWBzg,2012-04-25,1,0,2,"Since writing my first review, the Thompson Di...",3.00,,53,2011-09-21,"Breakfast & Brunch, Diners, American (New), Re...",Toronto,43.642914,-79.402046,227,3.0,ON,0,0
8,oSgFvpNrQcFZFCUlPlDL7A,czEmZSWIXuaBs0cTQNs6cQ,ANzR73RKDqk4w5jTCDWyug,2012-11-02,0,0,4,I have used this company now two times as I am...,4.02,,42,2011-10-22,"Pest Control, Local Services",North Las Vegas,36.248675,-115.153235,16,4.5,NV,1,1
9,_XUL-9rr6_GS4VXeL_jkjw,luxQ9RFLDM0ksWvlvVSUAA,VraV4Ci-oJsONsoIWCNeXA,2011-08-13,0,0,0,Including last night after they closed!!! thr...,3.88,,41,2011-05-07,"Automotive, Auto Parts & Supplies, Tires, Whee...",Chandler,33.305484,-111.870804,102,4.0,AZ,1,1


In [0]:
# Review ids next to their training-set predictions, saved (with the index) and downloaded.
predf = train.loc[:, ['review_id', 'pred']]
predf.to_csv('milan.csv')
files.download('milan.csv')

In [0]:
# Show only the first rows of the submission template.
sample.head(10)

Unnamed: 0,review_id,is_good_rating
0,SiQGQ__poSeG1oKHVem30g,1
1,rFZPxXnZZuEfO5Cj8SprGw,1
2,DU9CeH0kUfN8LO7V02fxaA,1
3,dJgLik5M3vvHYHDkIW62qA,1
4,fi9Pbb8Z4bzzlHaxXGEJTg,1
5,uTqh1QYTsbZWQzEFN9GJpw,1
6,_3r-ySAFPD-khtsLnnXSvg,1
7,23WAFia8X-oKzJ3ITadiMQ,1
8,A2hyFWfQCN_un9RMNidi-g,1
9,POw-zNBEj28okTFmScK9lg,1
