In [254]:
import pandas as pd 
import numpy as np

corpus = pd.read_excel("FBData.xlsx", sheetname = "Data")


In [255]:
# corpus = corpus[corpus['web'].isin(['TmobileCz','o2cz','vodafoneCZ'])]
corpus = corpus[corpus['GOLD'].isin(['p','n',0])]


In [256]:
## Split randomly in training and testing
train = corpus.sample(frac=0.8)
test = corpus.loc[~corpus.index.isin(train.index)]

In [257]:
##########################################
## Feature extraction 
##########################################

from sklearn.feature_extraction.text import CountVectorizer

# Import custom stop words in Czech

cz_sw = pd.read_csv('cz_stop_words.txt', names = ['word'])

# my_stop_words = list(cz_sw['word']) tohle nefunguje, asi nemam sloupec word?
my_stop_words = list(cz_sw['word']) + (["bych", "taky", "jo", "no", "den", "tam", "sem", "něco"])

count_vect = CountVectorizer(analyzer='word', stop_words=my_stop_words)

#count_vect = CountVectorizer(analyzer="word")

In [258]:
# cz_sw = pd.read_csv('cz_stop_words.txt')
cz_sw.head(10)

Unnamed: 0,word
0,a
1,a sice
2,a to
3,aby
4,aj
5,ale
6,ani
7,aniz
8,aniž
9,ano


In [259]:
#  Different methods of counting word frequency
X_train_counts = count_vect.fit_transform(train['Text'])


In [260]:
## Label encoder
from sklearn.preprocessing import LabelEncoder

In [261]:
le  = LabelEncoder()
y_train = le.fit_transform(train['GOLD'].astype(str))

In [262]:

##########################################
## Training the model
##########################################


from sklearn.naive_bayes import MultinomialNB

In [263]:
clf = MultinomialNB().fit(X_train_counts, y_train)

In [264]:
# Convert to matrix form the test data
X_test = count_vect.transform(test['Text'])
y_test = le.transform(test['GOLD'].astype(str))

In [265]:
y_preds = clf.predict(X_test)

In [266]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_preds))

             precision    recall  f1-score   support

          0       0.68      0.87      0.76      1026
          1       0.62      0.28      0.38       416
          2       0.76      0.67      0.71       508

avg / total       0.69      0.69      0.67      1950



In [267]:
##########################################
## Interpreting model results
##########################################

def print_topn(vect, clf, class_labels, n=10):
    feature_names = vect.get_feature_names()
    for i, class_label in enumerate(class_labels):
        topn= np.argsort(clf.coef_[i])[-n:]
        print("%s: %s "% (class_label,
              " ".join(feature_names[j] for j in topn)))

In [268]:
# Show important words
print_topn(count_vect, clf, le.classes_, n=20)

0: třeba sms ti nevím tady kolik odpověď telefon chci moc o2 jestli někdo zeptat kdy díky prosím děkuji mám dobrý 
n: vás stále nikdy nikdo mobile vůbec fakt pořád vodafone nefunguje ti teda zase teď někdo nejde tady mám o2 moc 
p: miluju krásná krásný zoo líbí fakt děkuju dobře hned hodně přeji krásné opravdu vůně nejlepší díky mám super děkuji moc 


In [245]:
##########################################################################
## YOUR TURN:
## 0. Filter out some words that make no sense (for instance "bych" above)    
## 1. Read your table 
## 2. Apply count_vect.transform to the column that has the comments
## 3. Score the model. You can use clf.predict on the vectorized data frame
## 4. Save the dataframe and load it back into Keboola. 
##      The resulting dataframe should have columns 
##      operator|message|sentiment.
##    To transform back the labels of the model into our labels, 
##    you can use le.inverse_transform    
##########################################################################

In [271]:
o2 = pd.read_csv("Czechitas/commentsO2_clean.csv", encoding = "UTF-8", sep=';')
o2['message'] = o2['message'].fillna('a')
X_o2 = count_vect.transform(o2['message'])
o2_preds = clf.predict(X_o2)
o2['prediction'] = le.inverse_transform(o2_preds)
o2_final = pd.DataFrame(o2, columns =['operator', 'created_time', 'message', 'likes_count', 'comments_count','prediction'])
o2_final['operator'] = 'o2'


In [272]:
tm = pd.read_csv("Czechitas/commentsTM_clean.csv", encoding = "UTF-8", sep=';')
tm['message'] = tm['message'].fillna('a')
X_tm = count_vect.transform(tm['message'])
tm_preds = clf.predict(X_tm)
tm['prediction'] = le.inverse_transform(tm_preds)
tm_final = pd.DataFrame(tm, columns =['operator', 'created_time', 'message', 'likes_count', 'comments_count','prediction'])
tm_final['operator'] = 'tm'

In [273]:
vf = pd.read_csv("Czechitas/commentsVF_clean.csv", encoding = "UTF-8", sep=';')
vf['message'] = vf['message'].fillna('a')
X_vf = count_vect.transform(vf['message'])
vf_preds = clf.predict(X_vf)
vf['prediction'] = le.inverse_transform(vf_preds)
vf_final = pd.DataFrame(vf, columns =['operator', 'created_time', 'message', 'likes_count', 'comments_count','prediction'])
vf_final['operator'] = 'vf'

In [280]:
dfUnionAll = pd.concat([o2_final, tm_final, vf_final], ignore_index=True)
dfUnionAll.head()

Unnamed: 0,operator,created_time,message,likes_count,comments_count,prediction
0,o2,2014-01-13T16:06:49+0000,FREE tarif bez internetu by se mi líbil daleko...,5,0,0
1,o2,2014-01-13T16:08:19+0000,"alespon neco, když už je to tak drahý... :D",4,0,n
2,o2,2014-01-13T16:08:32+0000,"Ehh to je fakt odvaz 1,5GB v dobe fullhd YouTu...",14,0,0
3,o2,2014-01-13T16:09:41+0000,hm ješte kdyby to bylo tak za 400kc mesícne kd...,5,0,0
4,o2,2014-01-13T16:12:20+0000,Lepší jak nic :) Dekuji za 0.5GB navíc :),5,0,0


In [281]:
sentiment = pd.concat([o2_final, tm_final, vf_final], ignore_index=True)

sentiment.to_csv('sentiment.csv', encoding='utf-8', sep=";")

In [282]:
sentiment['id'] = range(1, len(sentiment) + 1)
sentiment.to_csv('sentiment2.csv', encoding='utf-8', sep=";")

In [250]:
# o2.to_csv('o2_sentiment5.csv', encoding='utf-8', sep=";")


In [63]:
# prevest na normalni datum
import datetime, dateutil.parser

d = dateutil.parser.parse('2008-09-26T01:51:42.000Z')
print d.strftime('%m/%d/%Y') #==> '09/26/2008'


In [67]:
o2['predictions'] = o2_preds

import xlsxwriter
writer = pd.ExcelWriter('o2_comments.xlsx', engine='xlsxwriter')

# Convert the dataframe to an XlsxWriter Excel object.
o2.to_excel(writer, sheet_name='Sheet1')

# Close the Pandas Excel writer and output the Excel file.
writer.save()


  force_unicode(url))
  force_unicode(url))
  force_unicode(url))
  force_unicode(url))
  force_unicode(url))
  force_unicode(url))
  force_unicode(url))


NameError: name 'tm_comments' is not defined

IndexError: invalid index

In [78]:
##########################################
## Feature extraction 
##########################################

from sklearn.feature_extraction.text import CountVectorizer

# Import custom stop words in Czech

cz_sw = pd.read_csv('cz_stop_words.txt', names = ['word'])

# my_stop_words = list(cz_sw['word']) tohle nefunguje, asi nemam sloupec word?
my_stop_words = list(cz_sw['word']) + (['tam', "den", "mám", "mam", "jo", "bych", "no", "vůně", "zoo", "zase", "sem"])

count_vect = CountVectorizer(analyzer='word', stop_words=my_stop_words)

#count_vect = CountVectorizer(analyzer="word")

# yaloha

NameError: name 'class_labels' is not defined

In [247]:
o2 = pd.read_csv("Czechitas/commentsO2_clean.csv", encoding = "UTF-8", sep=';')
o2['message'] = o2['message'].fillna('a')
X_o2 = count_vect.transform(o2['message'])
o2_preds = clf.predict(X_o2)
o2.head(10)
# o2.to_csv('o2_sentiment10.csv', encoding='utf-8', sep=";")

Unnamed: 0,from_id,from_name,message,created_time,likes_count,comments_count,id
0,1655155837834881,Miroslav Pleskot,FREE tarif bez internetu by se mi líbil daleko...,2014-01-13T16:06:49+0000,5,0,659919047380028_1982999
1,1103097663153070,Alena Matišcáková,"alespon neco, když už je to tak drahý... :D",2014-01-13T16:08:19+0000,4,0,659919047380028_1983002
2,10212111270199918,Tomáš Novotný,"Ehh to je fakt odvaz 1,5GB v dobe fullhd YouTu...",2014-01-13T16:08:32+0000,14,0,659919047380028_1983003
3,1423701194318386,Daniel Prell,hm ješte kdyby to bylo tak za 400kc mesícne kd...,2014-01-13T16:09:41+0000,5,0,659919047380028_1983010
4,10208934700671278,Tomáš Valder,Lepší jak nic :) Dekuji za 0.5GB navíc :),2014-01-13T16:12:20+0000,5,0,659919047380028_1983016
5,1625346820813591,Martin Machata,K smichu :))),2014-01-13T16:14:55+0000,2,0,659919047380028_1983022
6,10208495524245992,Jan Sedlácek,super:)),2014-01-13T16:15:26+0000,2,0,659919047380028_1983023
7,1119754194817221,Honza Vána,Super :-) me by zajímalo kdy už bude LTE dost...,2014-01-13T16:16:19+0000,5,0,659919047380028_1983026
8,10209064330305224,Rosta Filar,ja chci tarif: SMS zdarma a internet 1500MB z...,2014-01-13T16:20:36+0000,9,0,659919047380028_1983036
9,10202550788609056,Tomáš Tichý,"K smichu...kdyz po 3 dnech mi prijde sms,ze uz...",2014-01-13T16:22:50+0000,3,0,659919047380028_1983040


In [224]:
o2_1 = pd.DataFrame(['operator']= 'o22' ,o2['created_time'], o2['message'], o2['likes_count'], 
                    o2['comments_count'], o2['prediction'])

SyntaxError: keyword can't be an expression (<ipython-input-224-c40fb129627b>, line 1)

In [225]:
o2_final = pd.DataFrame(o2, columns =['operator', 'created_time', 'message', 'likes_count', 'comments_count','prediction'])

In [228]:
o2_final['operator'] = 'o2'
o2_final.head(10)

Unnamed: 0,operator,created_time,message,likes_count,comments_count,prediction
0,o2,2014-01-13T16:06:49+0000,FREE tarif bez internetu by se mi líbil daleko...,5,0,
1,o2,2014-01-13T16:08:19+0000,"alespon neco, když už je to tak drahý... :D",4,0,
2,o2,2014-01-13T16:08:32+0000,"Ehh to je fakt odvaz 1,5GB v dobe fullhd YouTu...",14,0,
3,o2,2014-01-13T16:09:41+0000,hm ješte kdyby to bylo tak za 400kc mesícne kd...,5,0,
4,o2,2014-01-13T16:12:20+0000,Lepší jak nic :) Dekuji za 0.5GB navíc :),5,0,
5,o2,2014-01-13T16:14:55+0000,K smichu :))),2,0,
6,o2,2014-01-13T16:15:26+0000,super:)),2,0,
7,o2,2014-01-13T16:16:19+0000,Super :-) me by zajímalo kdy už bude LTE dost...,5,0,
8,o2,2014-01-13T16:20:36+0000,ja chci tarif: SMS zdarma a internet 1500MB z...,9,0,
9,o2,2014-01-13T16:22:50+0000,"K smichu...kdyz po 3 dnech mi prijde sms,ze uz...",3,0,


In [227]:
o2.head(10)

Unnamed: 0,from_id,from_name,message,created_time,likes_count,comments_count,id
0,1655155837834881,Miroslav Pleskot,FREE tarif bez internetu by se mi líbil daleko...,2014-01-13T16:06:49+0000,5,0,659919047380028_1982999
1,1103097663153070,Alena Matišcáková,"alespon neco, když už je to tak drahý... :D",2014-01-13T16:08:19+0000,4,0,659919047380028_1983002
2,10212111270199918,Tomáš Novotný,"Ehh to je fakt odvaz 1,5GB v dobe fullhd YouTu...",2014-01-13T16:08:32+0000,14,0,659919047380028_1983003
3,1423701194318386,Daniel Prell,hm ješte kdyby to bylo tak za 400kc mesícne kd...,2014-01-13T16:09:41+0000,5,0,659919047380028_1983010
4,10208934700671278,Tomáš Valder,Lepší jak nic :) Dekuji za 0.5GB navíc :),2014-01-13T16:12:20+0000,5,0,659919047380028_1983016
5,1625346820813591,Martin Machata,K smichu :))),2014-01-13T16:14:55+0000,2,0,659919047380028_1983022
6,10208495524245992,Jan Sedlácek,super:)),2014-01-13T16:15:26+0000,2,0,659919047380028_1983023
7,1119754194817221,Honza Vána,Super :-) me by zajímalo kdy už bude LTE dost...,2014-01-13T16:16:19+0000,5,0,659919047380028_1983026
8,10209064330305224,Rosta Filar,ja chci tarif: SMS zdarma a internet 1500MB z...,2014-01-13T16:20:36+0000,9,0,659919047380028_1983036
9,10202550788609056,Tomáš Tichý,"K smichu...kdyz po 3 dnech mi prijde sms,ze uz...",2014-01-13T16:22:50+0000,3,0,659919047380028_1983040
