In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize  
from sklearn.feature_extraction.text import TfidfVectorizer ,TfidfTransformer,CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV, KFold
import nltk
import math

In [6]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to C:\Users\SAITEJA-
[nltk_data]     WORKMACHINE\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\SAITEJA-
[nltk_data]     WORKMACHINE\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\SAITEJA-
[nltk_data]     WORKMACHINE\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [23]:
df = pd.read_csv("ios.txt")

In [24]:
df["content"] = df["title"] +" " + df["text"]


In [25]:
del df['appid']
del df['id']
del df['text']
del df['title']
del df['url']
del df['userUrl']
del df["userName"]
del df["version"]


In [26]:
df

Unnamed: 0,score,content
0,1,Absolutely terrible I am literally writing thi...
1,2,"Give the people what they want Soooo, no matte..."
2,1,"Please fix... Well did an update on Tuesday, &..."
3,1,"New Update is Trash, Useless (iOS) The new upd..."
4,3,Messenger may have a problem I don’t do review...
...,...,...
3896,5,Pink-a-licous While I love the app I do not en...
3897,3,Rating the app and clothes I’ve found the app ...
3898,3,"Issue I shop in PINK 24/7, but everytime I att..."
3899,3,Wishlist??? I am very confused on why it does ...


In [27]:
y = df["score"]
X = df.drop("score",axis =1,inplace=False)


In [28]:
X_train,X_test,y_train,y_test = train_test_split(X.index,y,test_size=0.2)
df_train=pd.concat([X,y],axis=1).iloc[X_train] # return dataframe train
df_test=pd.concat([X,y],axis=1).iloc[X_test] # return dataframe test
print(df_train.shape, df_test.shape)

(3120, 2) (781, 2)


In [29]:
df_test

Unnamed: 0,content,score
2513,Meh... This game is very satisfying and addict...,2
2709,Waste of money I am not a fan of fitness apps....,1
3288,Pls offer better iPad support! Overall great a...,5
2803,Lyft pink pass Please no one should fall for t...,2
582,Good but The app is good but I find it a littl...,4
...,...,...
35,Message Requests I have had no problem with me...,1
3450,"Childish Business for the Modern Era, pt. 1 Gi...",2
1640,Terrible app U-Haul’s app is garbage. When it ...,1
3647,"It’s ok for iOS, but much better for the deskt...",2


In [30]:
text_train=df_train["content"].values
text_test = df_test["content"].values
y_train= df_train["score"].values
y_test = df_test["score"].values

In [38]:
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]

In [39]:
stemmer = SnowballStemmer("english", ignore_stopwords=True)

In [40]:
class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: ([stemmer.stem(w) for w in analyzer(doc)])

In [41]:
stemed_vec = StemmedCountVectorizer(min_df= 2,ngram_range=(1,2),tokenizer = LemmaTokenizer())

In [42]:
clf = Pipeline([("vec",stemed_vec),("tf-idf",TfidfTransformer()),
	            ("SVD",TruncatedSVD(n_components=3000)),("svr",SVR(C=10,gamma=1,verbose=1))])

In [43]:
param_grid = {'svr__C': [0.001, 0.01, 0.1, 1, 10], 'svr__gamma': [0.001, 0.01, 0.1, 1],
              'SVD__n_components':[1000,2000,3000,4000],'vec_ngram_range':[(1,1),(1,2),(1,3)],
              'tfidf__use_idf': (True, False)}

In [44]:
cv = KFold(shuffle=True)
grid = GridSearchCV(clf, param_grid=param_grid, cv=cv, verbose=3)

#### Fitting the model

In [45]:
clf.fit(text_train, y_train)

[LibSVM]

Pipeline(memory=None,
         steps=[('vec',
                 StemmedCountVectorizer(analyzer='word', binary=False,
                                        decode_error='strict',
                                        dtype=<class 'numpy.int64'>,
                                        encoding='utf-8', input='content',
                                        lowercase=True, max_df=1.0,
                                        max_features=None, min_df=2,
                                        ngram_range=(1, 2), preprocessor=None,
                                        stop_words=None, strip_accents=None,
                                        token_pattern='(?u)\\b\\w\\w+\\b',
                                        tokenizer=<__main__.Lem...
                                        vocabulary=None)),
                ('tf-idf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('SVD',

#### Prediction

In [58]:
y_pred = clf.predict(text_test)
#print(y_pred)

[ 2.84364079e+00  9.78434056e-01  4.43471047e+00  6.97649766e-01
  3.31274446e+00  2.42847162e+00  3.24918465e+00  3.28748261e+00
  2.98666276e+00  2.33053900e+00  2.84311150e+00  2.51005411e+00
  1.65681016e+00  3.57344332e+00  3.43825072e+00  4.07605843e+00
  5.18579340e+00  1.36471819e+00  3.71366379e+00  5.99107228e+00
  3.39579366e+00  2.74274731e+00  1.41222004e+00  2.37733360e+00
  9.92087094e-01  3.21376903e-01  5.86357403e+00  2.57788098e+00
  3.67609369e+00  1.22261796e+00  1.76515937e-02  5.70014943e+00
  2.26873661e+00  2.20555373e-01  3.52014231e+00  7.06278592e-01
  3.33240357e+00  2.99154556e+00  1.76723145e+00  3.61171912e-01
  2.16152786e+00  5.20392841e+00 -3.95614648e-01  4.08618340e+00
  1.43174809e-01  3.99312543e+00  7.95747211e-01  1.42417807e+00
  4.72979481e+00  2.40872483e+00  2.99197202e+00  2.23627086e+00
  8.69773293e-01  4.16364642e+00  2.74248944e+00  1.09614785e+00
  4.14235813e+00  2.29078194e-01  3.96397980e+00  6.53228194e+00
 -5.84329588e-01  1.59100

In [59]:
#Adjust the value to 1~5
for i in range(len(y_pred)):
	if (y_pred[i]>5) : y_pred[i]=5
	if (y_pred[i]<1) : y_pred[i]=1

In [67]:
y_testreal = df_test["score"].values
acc=0
for i in range(len(y_testreal)):
    if y_test[i]== math.ceil(y_pred[i]):
        acc=acc+1
    if i <18:
        print(y_test[i], round(y_pred[i]))
acc = acc/len(y_test)
print(acc)

2 3.0
1 1.0
5 4.0
2 1.0
4 3.0
3 2.0
5 3.0
5 3.0
4 3.0
2 2.0
5 3.0
5 3.0
1 2.0
5 4.0
5 3.0
3 4.0
4 5.0
3 1.0
0.4148527528809219


In [68]:
sample = "This app has so many updates constantly and you cant even figure out how to make the camera on this app work!! And no it's not my phone, when I go into my camera everything is crystal clear but the minute I go on your app to take a picture it's nothing but blur even when you click on what you want it to focus on... Fix the app with your next update instead of making it worst"

In [69]:
 (clf.predict([sample]))[0]

0.9583481638745277