# Ładowanie bibliotek

In [1]:
import pandas as pd
import numpy as np
import re
import nltk

# Pre-compiled patterns for text clean-up. They are written as raw strings so that
# regex escapes such as \s, \) and \. reach the regex engine unchanged instead of
# being parsed as (invalid) Python string escapes, which triggers a warning.
RE_SPACES = re.compile(r"\s+")  # one or more whitespace characters
RE_HASHTAG = re.compile(r"[@#][_a-z0-9]+")  # '@' or '#' followed by lowercase letters, digits or '_'
RE_EMOTICONS = re.compile(r"(:-?\))|(:p)|(:d+)|(:-?\()|(:/)|(;-?\))|(<3)|(=\))|(\)-?:)|(:'\()|(8\))")  # e.g. :) :-( ;) <3 :p
RE_HTTP = re.compile(r"http(s)?://[/\.a-z0-9]+")  # http/https URL built from lowercase letters, digits, '/' and '.'

In [2]:
# Load both comma-separated splits; a literal "?" cell is read as a missing value.
train_data = pd.read_csv("train.csv", na_values=["?"], sep=",")
test_data = pd.read_csv("test.csv", na_values=["?"], sep=",")

# Informacje o danych
a) przykładowe dane:

In [3]:
# Preview the first five rows of the test set.
test_data.head(5)

Unnamed: 0,Id,Product Name,Brand Name,Price,Reviews
0,202646,Huawei P8 Lite ALE-L23 Factory Unlocked 16GB L...,Huawei,184.9,Excelent
1,202647,Huawei P8 Lite ALE-L23 Factory Unlocked 16GB L...,Huawei,184.9,Fit perfectly. Excelent.
2,202648,Huawei P8 Lite ALE-L23 Factory Unlocked 16GB L...,Huawei,184.9,"It has a very good relation, price/performance."
3,202649,Huawei P8 Lite ALE-L23 Factory Unlocked 16GB L...,Huawei,184.9,Like it a lot
4,202650,Huawei P8 Lite ALE-L23 Factory Unlocked 16GB L...,Huawei,184.9,"Dear , i had a problem wich the wifi senial !!..."


b) rozmiar

In [4]:
# (number of rows, number of columns) of the test set.
test_data.shape

(211195, 5)

In [5]:
class Tokenizer:
    """Common base for tokenizers; subclasses supply their own `tokenize`."""

    @staticmethod
    def tokenize(text):
        """Placeholder implementation: ignores `text` and returns None."""
        return None
    
class BeforeTokenizationNormalizer:
    """Light clean-up applied to a raw review before it is tokenized."""

    # (entity, replacement) pairs, applied in this order. '&amp;' is decoded LAST:
    # decoding it earlier would turn an escaped entity such as '&amp;pound;' into
    # '&pound;' and then into the pound sign, i.e. unescape the text twice.
    _ENTITY_REPLACEMENTS = (
        ('&nbsp;', ' '),
        ('&lt;', '<'),
        ('&gt;', '>'),
        ('&pound;', u'£'),
        ('&euro;', u'€'),
        ('&copy;', u'©'),
        ('&reg;', u'®'),
        ('&amp;', '&'),
    )

    @staticmethod
    def normalize(text):
        """Return `text` stripped, lower-cased and with common HTML entities decoded.

        Parameters:
            text: the raw value to clean. Non-string values are converted with
                `str()`. Missing values (None or a float NaN) yield an empty
                string, so they do not become the literal tokens "none"/"nan".

        Returns:
            str: the normalized text.
        """
        # NaN is the only float that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ''

        text = str(text).strip().lower()
        # Entities are matched after lower-casing, so '&AMP;' is handled as well.
        for entity, replacement in BeforeTokenizationNormalizer._ENTITY_REPLACEMENTS:
            text = text.replace(entity, replacement)
        return text

In [6]:
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# One Porter stemmer instance, shared by every tokenize() call.
porter = nltk.PorterStemmer()


class NltkTokenizer(Tokenizer):
    """Tokenizer that splits text with NLTK and Porter-stems every token."""

    @staticmethod
    def tokenize(text):
        """Return the stemmed tokens of `text` as a list, in their original order."""
        return [porter.stem(token) for token in nltk.word_tokenize(text)]

In [7]:
# for i in train_data.index:
#     review = BeforeTokenizationNormalizer.normalize(train_data.iat[i, 4])
#     words_nltk = NltkTokenizer.tokenize(review)
#     print(words_nltk)

In [8]:
from collections import Counter
# Corpus-wide frequency of every stemmed token in the training reviews.
words = Counter()

# Iterate over the values of the column at position 4 (the review text) rather than
# passing index labels to the positional `.iat` accessor: labels only coincide with
# positions for a default RangeIndex, so the old form broke on any filtered,
# shuffled or otherwise re-indexed frame.
for raw_review in train_data.iloc[:, 4]:
    review = BeforeTokenizationNormalizer.normalize(raw_review)
    words.update(NltkTokenizer.tokenize(review))

In [9]:
# Every counted token with its frequency, most frequent first (no limit is passed).
words.most_common()

[('.', 388832),
 ('the', 337576),
 ('i', 261670),
 ('it', 249429),
 (',', 236959),
 ('phone', 201344),
 ('and', 199329),
 ('a', 178419),
 ('to', 172959),
 ('is', 136464),
 ('thi', 105022),
 ('for', 94337),
 ('!', 87733),
 ('wa', 79360),
 ('not', 78951),
 ('with', 76175),
 ('my', 75552),
 ('of', 72734),
 ('in', 67759),
 ('that', 65872),
 ('but', 63489),
 ('work', 59462),
 ('on', 58620),
 ("n't", 55743),
 ('have', 55429),
 ('you', 51284),
 ('great', 47260),
 ('good', 47014),
 ('use', 44314),
 ('as', 43843),
 ('veri', 42679),
 ('so', 39681),
 ("'s", 35530),
 ('be', 33063),
 ('had', 31573),
 ('no', 30453),
 ('like', 30092),
 ('do', 30018),
 ('one', 28659),
 ('all', 28207),
 ('get', 27301),
 ('love', 27256),
 ('screen', 26600),
 ('at', 26476),
 ('me', 26324),
 (')', 25590),
 ('just', 25285),
 ('would', 24439),
 ('batteri', 23981),
 ('...', 23777),
 ('or', 23697),
 ('(', 23475),
 ('from', 23405),
 ('if', 23160),
 ('can', 22928),
 ('an', 22846),
 ('ha', 22801),
 ('new', 22503),
 ('are', 22469

In [10]:
# Tokens treated as punctuation; the next cell deletes each of them from the `words` counter.
# "!" is not listed here, so it stays in the counter.
punctuation_marks = [".", ",", "(", ")", "?", "[", "]", "-", "_", "--", "''", ":", ";", "&", "``", "...", "%" ]
# English stopwords plus the contraction fragments produced by the tokenizer ("n't", "'s", ...).
# In the visible notebook this list is referenced only by a commented-out removal loop.
# NOTE(review): the tokens in `words` are Porter-stemmed (the frequency listing shows 'thi', 'wa',
# 'veri', 'ha'), so unstemmed entries such as "this", "was", "very", "has" would not match if this
# list were applied to `words` as-is — confirm before enabling stopword removal.
stopwords = ["a", "about", "after", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because", "been",
            "before", "being", "between", "both", "by", "could", "did", "do", "does", "doing", "during", "each",
            "for", "from", "further", "had", "has", "have", "having", "he", "her", "here", "hers", "herself", "him",
            "himself", "his", "how", "i", "in", "into", "is", "it", "its", "itself", "let", "me", "more", "most", "my",
            "myself", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "own", "sha",
            "she", "should", "so", "some", "such", "than", "that", "the", "their", "theirs", "them", "themselves",
            "then", "there", "there's", "these", "they", "this", "those", "through", "to", "until", "up", "very",
            "was", "we", "were", "what", "when", "where", "which", "while", "who","whom", "with", "would", "you",
            "your", "yours", "yourself", "yourselves",
            "n't", "'s", "'ll", "'re", "'d", "'m", "'ve",
            "above", "again", "against", "below", "but",  "few", "if",  "off",
            "out", "over", "same", "too", "under", "why"]
# Words left out of the list above (presumably because negations matter for the rating — confirm):
# "no", "nor", "not","cannot", "down",

In [11]:
# Remove each punctuation token from the counter, skipping marks that were never counted.
for mark in punctuation_marks:
    if words[mark] <= 0:
        continue
    del words[mark]
        
# for s in stopwords:
#     if (words[s] > 0):
#         del words[s]

In [12]:
# Full frequency listing again, after the punctuation tokens were deleted from the counter.
words.most_common()

[('the', 337576),
 ('i', 261670),
 ('it', 249429),
 ('phone', 201344),
 ('and', 199329),
 ('a', 178419),
 ('to', 172959),
 ('is', 136464),
 ('thi', 105022),
 ('for', 94337),
 ('!', 87733),
 ('wa', 79360),
 ('not', 78951),
 ('with', 76175),
 ('my', 75552),
 ('of', 72734),
 ('in', 67759),
 ('that', 65872),
 ('but', 63489),
 ('work', 59462),
 ('on', 58620),
 ("n't", 55743),
 ('have', 55429),
 ('you', 51284),
 ('great', 47260),
 ('good', 47014),
 ('use', 44314),
 ('as', 43843),
 ('veri', 42679),
 ('so', 39681),
 ("'s", 35530),
 ('be', 33063),
 ('had', 31573),
 ('no', 30453),
 ('like', 30092),
 ('do', 30018),
 ('one', 28659),
 ('all', 28207),
 ('get', 27301),
 ('love', 27256),
 ('screen', 26600),
 ('at', 26476),
 ('me', 26324),
 ('just', 25285),
 ('would', 24439),
 ('batteri', 23981),
 ('or', 23697),
 ('from', 23405),
 ('if', 23160),
 ('can', 22928),
 ('an', 22846),
 ('ha', 22801),
 ('new', 22503),
 ('are', 22469),
 ('when', 21164),
 ('time', 20760),
 ('iphon', 20474),
 ('will', 20356),
 ('

In [13]:
def create_bow(documents, features, test):
    """Build a binary bag-of-words matrix for the reviews in `documents`.

    Parameters:
        documents: pandas DataFrame. The review text is read from the column at
            position 4 and, when labels are requested, the label from the column
            at position 5.
        features: dict mapping a token to its column index in the matrix.
            Tokens that are not keys of this dict are ignored.
        test: despite its name, pass 1 to ALSO collect labels (the notebook does
            this for the training frame); any other value returns an empty
            label list.

    Returns:
        (matrix, labels): `matrix` is a scipy.sparse.csr_matrix of shape
        (len(documents), len(features)) holding 1 where a token occurs in a
        review; `labels` is a list with one label per row, or [] when `test`
        is not 1.
    """
    review_col, label_col = 4, 5

    row = []
    col = []
    data = []

    labels = list(documents.iloc[:, label_col]) if test == 1 else []

    # Number the rows by position, not by index label. The labels were previously
    # used both for `.iloc` lookups and as matrix row numbers, which is only valid
    # for a default RangeIndex and fails or misplaces rows on any other index.
    for position, raw_review in enumerate(documents.iloc[:, review_col]):
        review = BeforeTokenizationNormalizer.normalize(raw_review)
        # set(): each token contributes at most one entry per review (presence, not count).
        for token in set(NltkTokenizer.tokenize(review)):
            if token not in features:
                continue
            row.append(position)
            col.append(features[token])
            data.append(1)

    return csr_matrix((data, (row, col)), shape=(len(documents), len(features))), labels

In [14]:
from scipy.sparse import csr_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, precision_score, recall_score
# from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

# Vocabulary cut-off: a token must occur strictly more than this many times to become a feature.
min_word_count = 50

common_words = [token for token, count in words.most_common() if count > min_word_count]

# A token's column index is its rank in the frequency-ordered vocabulary.
feature_dict = {token: column for column, token in enumerate(common_words)}

print("Training classifier...")
print("1. create bow")
# The last argument 1 asks create_bow to return the labels as well.
X_train, y_train = create_bow(train_data, feature_dict, 1)
print("2. list of classes")
list_of_labels = list(set(y_train))
print("3. classifier")
# Alternatives left disabled in this notebook:
#   RandomForestClassifier(n_estimators=1000, n_jobs=-1, random_state=23, verbose=True)
#   SVC()
classifier = MLPClassifier(
    hidden_layer_sizes=(100, 100, 100),
    max_iter=500,
    alpha=0.0001,
    solver='sgd',
    random_state=21,
    tol=1e-9,
    verbose=True,
)
print("4. fit")
classifier.fit(X_train, y_train)

print("Testing...")
print("1. create bow")
# The last argument 0 means no labels are collected, so y_test is an empty list.
X_test, y_test = create_bow(test_data, feature_dict, 0)
print("2. predict")
predicted = classifier.predict(X_test)

print("=================== Results ===================")
print(predicted)


  from numpy.core.umath_tests import inner1d


Training classifier...
1. create bow
2. list of classes
3. classifier
4. fit
Iteration 1, loss = 1.22826789
Iteration 2, loss = 0.94341045
Iteration 3, loss = 0.84488436
Iteration 4, loss = 0.80195711
Iteration 5, loss = 0.77662253
Iteration 6, loss = 0.75767748
Iteration 7, loss = 0.74203179
Iteration 8, loss = 0.72767900
Iteration 9, loss = 0.71455300
Iteration 10, loss = 0.70216694
Iteration 11, loss = 0.69024219
Iteration 12, loss = 0.67848783
Iteration 13, loss = 0.66665284
Iteration 14, loss = 0.65470344
Iteration 15, loss = 0.64258551
Iteration 16, loss = 0.63007006
Iteration 17, loss = 0.61695216
Iteration 18, loss = 0.60384606
Iteration 19, loss = 0.59030274
Iteration 20, loss = 0.57676512
Iteration 21, loss = 0.56268966
Iteration 22, loss = 0.54853531
Iteration 23, loss = 0.53375672
Iteration 24, loss = 0.51883586
Iteration 25, loss = 0.50416211
Iteration 26, loss = 0.48894718
Iteration 27, loss = 0.47406218
Iteration 28, loss = 0.45957661
Iteration 29, loss = 0.44497255
Iter

In [15]:
# Store the predictions in test_data as a 'Rating' column (this modifies test_data in place),
# then select only the 'Id' and 'Rating' columns.
test_data['Rating'] = predicted
results = test_data[['Id','Rating']]

In [16]:
# Write the results to results.csv, leaving out the DataFrame index.
results.to_csv("results.csv", index=False)