In [6]:
import os
import pickle
import re
import warnings

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn.model_selection
import sklearn.preprocessing as preproc
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from tqdm import tqdm

%matplotlib notebook
warnings.filterwarnings("ignore")

In [7]:
class MyLogisticRegression:
    """Binary logistic regression fitted with batch gradient descent.

    Array layout used by every method: ``X`` is (n_features, n_examples), ``Y`` is
    (1, n_examples) holding 0/1 labels and ``w`` is (n_features, 1).
    """

    # Probabilities are clipped to [_EPS, 1 - _EPS] before taking logarithms so that a
    # saturated sigmoid (exactly 0.0 or 1.0) cannot turn the cost into nan/inf.
    _EPS = 1e-15

    def __init__(self, learning_rate = 1, num_iterations = 2000):
        """
        Store the hyper-parameters; the model itself is fitted by ``train_model``.
        Arguments:
        learning_rate -- step size applied to every gradient update
        num_iterations -- number of gradient descent steps to run
        """
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations
        # Placeholders until train_model() stores the learned parameters.
        self.w = []
        self.b = 0

    def initialize_weight(self,dim):
        """
        Create a vector of zeros of shape (dim, 1) for w and initialize b to 0.
        Argument:
        dim -- size of the w vector we want (the number of features)
        Returns:
        (w, b) -- the zero weight vector and the zero bias
        """
        w = np.zeros((dim,1))
        b = 0
        return w, b

    def sigmoid(self,z):
        """
        Compute the sigmoid of z.
        Argument:
        z -- scalar or numpy array with the linear score(s) of the classifier
        """
        # exp(-z) overflows to inf for very negative z, which correctly yields 0.0;
        # only the overflow warning is silenced, the value is unchanged.
        with np.errstate(over='ignore'):
            s = 1/(1 + np.exp(-z))
        return s

    def hypothesis(self,w,X,b):
        """
        Compute the predicted probabilities sigmoid(w.T @ X + b).
        Arguments:
         w -- weight vector of shape (n_features, 1)
         X -- input data of shape (n_features, n_examples)
         b -- bias, a scalar
        Returns:
         H -- probabilities of shape (1, n_examples)
        """
        H = self.sigmoid(np.dot(w.T,X)+b)
        return H

    def cost(self,H,Y,m):
        """
        Compute the mean binary cross-entropy of the hypothesis.
        Arguments:
         H -- the hypothesis (predicted probabilities)
         Y -- the true labels (0 or 1), same shape as H
         m -- number of training samples
        """
        # Keep probabilities strictly inside (0, 1): 0 * log(0) would otherwise be nan.
        H = np.clip(H, self._EPS, 1 - self._EPS)
        cost = -np.sum(Y*np.log(H)+ (1-Y)*np.log(1-H))/m
        cost = np.squeeze(cost)
        return cost

    def cal_gradient(self, w,H,X,Y):
        """
        Compute the gradient of the cost with respect to w and b.
        Arguments:
         w -- weights (unused, kept for interface compatibility)
         H -- predicted probabilities of shape (1, n_examples)
         X -- data of shape (n_features, n_examples)
         Y -- true labels of shape (1, n_examples)
        Returns:
         grads -- dictionary with "dw" (n_features, 1) and "db" (scalar)
        """
        m = X.shape[1]
        residual = H - Y
        dw = np.dot(X,residual.T)/m
        db = np.sum(residual)/m
        grads = {"dw": dw,
                 "db": db}
        return grads

    def gradient_position(self, w, b, X, Y):
        """
        Evaluate the model at (w, b): run the forward pass and return gradients and cost.
        Arguments:
         w -- weights, a numpy array of size (no. of features, 1)
         b -- bias, a scalar
         X -- data of size (no. of features, number of examples)
         Y -- true "label" vector (containing 0 or 1 ) of size (1, number of examples)
        Returns:
         (grads, cost) -- gradient dictionary from cal_gradient and the scalar cost
        """
        m = X.shape[1]
        H = self.hypothesis(w,X,b)            # compute activation
        cost = self.cost(H,Y,m)               # compute cost
        grads = self.cal_gradient(w, H, X, Y) # compute gradient

        return grads, cost

    def gradient_descent(self, w, b, X, Y, print_cost = False):
        """
        Optimize w and b by running self.num_iterations steps of gradient descent.

        Arguments:
        w -- weights, a numpy array of size (no. of features, 1)
        b -- bias, a scalar
        X -- data of size (no. of features, number of examples)
        Y -- true "label" vector (containing 0 or 1 ) of size (1, number of examples)
        print_cost -- True to print the loss every 100 steps

        Returns:
        params -- dictionary containing the weights w and bias b
        grads -- dictionary containing the gradients from the last step
                 (all zeros when num_iterations is 0)
        costs -- list of the costs recorded every 100 steps, usable to plot the learning curve
        """
        costs = []

        # Defined up front so that num_iterations == 0 returns cleanly instead of
        # failing on unbound dw/db below.
        dw = np.zeros_like(w, dtype=float)
        db = 0.0

        for i in range(self.num_iterations):
            # Cost and gradient calculation
            grads, cost = self.gradient_position(w,b,X,Y)

            # Retrieve derivatives from grads
            dw = grads['dw']
            db = grads['db']

            # update rule
            w = w - (self.learning_rate * dw)
            b = b - (self.learning_rate * db)

            # Record (and optionally print) the cost every 100 training iterations
            if i % 100 == 0:
                costs.append(cost)
                if print_cost:
                    print ('Cost after iteration %i: %f' %(i, cost))

        params = {'w': w,
                  'b': b}

        grads = {'dw': dw,
                 'db': db}

        return params, grads, costs

    def predict(self,X):
        '''
        Predict whether the label is 0 or 1 using the learned parameters self.w and self.b.
        train_model() must have been called first, since it is what stores them.

        Arguments:
        X -- data of size (no. of features, number of examples)

        Returns:
        Y_prediction -- a numpy array of shape (1, number of examples) containing 0.0/1.0
        '''
        X = np.array(X)

        w = self.w.reshape(X.shape[0], 1)
        b = self.b
        # Compute vector "H" of probabilities, then threshold it at 0.5 in one step
        H = self.hypothesis(w, X, b)
        Y_prediction = (H >= 0.5).astype(float)

        return Y_prediction

    def train_model(self, X_train, Y_train, X_test, Y_test, print_cost = False):
        """
        Fit the model on the training set and evaluate it on both sets.
        Prints the train and test accuracy.

        Arguments:
        X_train -- training set represented by a numpy array of shape (features, m_train)
        Y_train -- training labels represented by a numpy array (vector) of shape (1, m_train)
        X_test -- test set represented by a numpy array of shape (features, m_test)
        Y_test -- test labels represented by a numpy array (vector) of shape (1, m_test)
        print_cost -- Set to true to print the cost every 100 iterations

        Returns:
        d -- dictionary containing information about the model.
        """
        # initialize parameters with zeros
        dim = np.shape(X_train)[0]
        w, b = self.initialize_weight(dim)
        # Gradient descent (the caller's print_cost flag is forwarded, not overridden)
        parameters, grads, costs = self.gradient_descent(w, b, X_train, Y_train, print_cost = print_cost)

        # Retrieve parameters w and b from dictionary "parameters"
        self.w = parameters['w']
        self.b = parameters['b']

        # Predict test/train set examples
        Y_prediction_test = self.predict(X_test)
        Y_prediction_train = self.predict(X_train)
        # Accuracy in percent: labels are 0/1, so the mean absolute error is the error rate
        train_score = 100 - np.mean(np.abs(Y_prediction_train - Y_train)) * 100
        test_score = 100 - np.mean(np.abs(Y_prediction_test - Y_test)) * 100
        print('train accuracy: {} %'.format(train_score))
        print('test accuracy: {} %'.format(test_score))
        d = {'costs': costs,
             'Y_prediction_test': Y_prediction_test,
             'Y_prediction_train' : Y_prediction_train,
             'w' : self.w,
             'b' : self.b,
             'learning_rate': self.learning_rate,
             'num_iterations': self.num_iterations,
             'train accuracy': train_score,
             'test accuracy': test_score}

        return d

In [8]:
# Toy dataset for a quick sanity check of MyLogisticRegression.
# Features are stored row-wise (2 features x N examples); labels are a single row of 0/1.
X_train = np.array([
    [5, 6, 1, 3, 7, 4, 10, 1, 2, 0, 5, 3, 1, 4],
    [1, 2, 0, 2, 3, 3, 9, 4, 4, 3, 6, 5, 3, 7],
])
Y_train = np.array([
    [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1],
])
X_test = np.array([
    [2, 3, 3, 3, 2, 4],
    [1, 1, 0, 7, 6, 5],
])
Y_test = np.array([
    [0, 0, 0, 1, 1, 1],
])

In [9]:
# Fit the hand-written classifier on the toy dataset with default hyper-parameters,
# then show the training accuracy stored in the returned summary dictionary.
clf = MyLogisticRegression()
d = clf.train_model(X_train, Y_train, X_test=X_test, Y_test=Y_test)
print(d['train accuracy'])

train accuracy: 100.0 %
test accuracy: 100.0 %
100.0


# Считать данные

In [10]:
# Location of the reviews CSV. The default is the original author's local path;
# set the REVIEWS_CSV_PATH environment variable to run the notebook on another machine.
REVIEWS_CSV_PATH = os.environ.get(
    'REVIEWS_CSV_PATH',
    '/Users/i.pile/Downloads/archive-5/Reviews.csv',
)
df = pd.read_csv(REVIEWS_CSV_PATH)


In [11]:
# Drop repeated reviews: rows that agree on user, profile name, timestamp and text
# are treated as the same review and only the first occurrence is kept.
df.drop_duplicates(
    subset=['UserId', 'ProfileName', 'Time', 'Text'],
    inplace=True,
)

In [12]:
# Binary target: 1 for reviews scored above 3, 0 for everything else.
df['Label'] = (df['Score'] > 3).astype('int64')

# Доля нормальных отзывов

In [13]:
# Fraction of rows whose Label is 1.
df['Label'].mean()

0.7794624974297659

In [14]:
# Contractions collected from http://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python
# Maps a lower-case English contraction to its expanded form. clean_text() looks up
# whole whitespace-separated, lower-cased tokens in this table.
contractions = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he's": "he is",
"how'd": "how did",
"how'll": "how will",
"how's": "how is",
"i'd": "i would",
"i'll": "i will",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'll": "it will",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"must've": "must have",
"mustn't": "must not",
"needn't": "need not",
"oughtn't": "ought not",
"shan't": "shall not",
"sha'n't": "shall not",
"she'd": "she would",
"she'll": "she will",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"that'd": "that would",
"that's": "that is",
"there'd": "there had",
"there's": "there is",
"they'd": "they would",
"they'll": "they will",
"they're": "they are",
"they've": "they have",
"wasn't": "was not",
"we'd": "we would",
"we'll": "we will",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"where'd": "where did",
"where's": "where is",
"who'll": "who will",
"who's": "who is",
"won't": "will not",
"wouldn't": "would not",
"you'd": "you would",
"you'll": "you will",
"you're": "you are"
}

In [15]:
# Filled lazily by _english_stopwords() so the NLTK corpus is read once, not once per review.
_STOPWORDS_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, loading them only on first use."""
    if 'english' not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE['english'] = frozenset(stopwords.words("english"))
    return _STOPWORDS_CACHE['english']


def clean_text(text, remove_stopwords = True):
    '''Normalize one review and split it into tokens.

    Steps: lower-case, expand contractions, strip line-break tags, URLs and
    punctuation, optionally drop English stop words, then tokenize.

    Arguments:
    text -- the raw review text (a string)
    remove_stopwords -- when True, NLTK English stop words are removed

    Returns:
    A list of token strings produced by nltk.WordPunctTokenizer.
    '''
    # Convert words to lower case
    text = text.lower()

    # Replace contractions with their longer forms. Only whole whitespace-separated
    # tokens are looked up, so a contraction with punctuation attached ("don't,")
    # is left as-is here and split apart by the punctuation rules below.
    text = " ".join(contractions.get(word, word) for word in text.split())

    # Line-break tags must go before the punctuation pass: that pass replaces '/'
    # with a space, after which '<br />' can no longer be recognised.
    text = re.sub(r'<br\s*/?>', ' ', text)
    # Remove only the URL itself. The text has no newlines left after the join above,
    # so a greedy '.*' would delete everything that follows the first URL.
    text = re.sub(r'https?://[^\s<>"]+', ' ', text)
    text = re.sub(r'\<a href', ' ', text)
    text = re.sub(r'&amp;', '', text)
    text = re.sub(r'[_"\-;%()|+&=*.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'\'', ' ', text)

    # remove stop words
    if remove_stopwords:
        stops = _english_stopwords()
        text = " ".join(w for w in text.split() if w not in stops)

    # Tokenize each word
    return nltk.WordPunctTokenizer().tokenize(text)

In [16]:
import swifter

In [17]:
# Clean and tokenize every review; swifter picks the apply strategy.
df['Text_Cleansed'] = df['Text'].swifter.apply(clean_text)

Pandas Apply:   0%|          | 0/393933 [00:00<?, ?it/s]

# Лемматизация

In [19]:
# Map every token of each cleaned review to its WordNet lemma.
lemm = WordNetLemmatizer()
df['lemmatized_text'] = df['Text_Cleansed'].swifter.apply(
    lambda words: [lemm.lemmatize(word) for word in words]
)

Pandas Apply:   0%|          | 0/393933 [00:00<?, ?it/s]

# Мешок слов

In [18]:
# Bag-of-words over the already tokenized reviews: the identity tokenizer hands each
# token list straight to the vectorizer, and lowercase=False skips its string lower-casing.
bow_converter = CountVectorizer(tokenizer=lambda doc: doc, lowercase=False)
x = bow_converter.fit_transform(df['Text_Cleansed'])

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use the new accessor when it exists.
if hasattr(bow_converter, 'get_feature_names_out'):
    words = bow_converter.get_feature_names_out()
else:
    words = bow_converter.get_feature_names()
len(words)

KeyboardInterrupt: 

In [25]:
# 30% of the rows go to training and the rest to testing; the fixed seed keeps the split reproducible.
training_data, test_data = sklearn.model_selection.train_test_split(
    df,
    train_size=0.3,
    random_state=42,
)

In [26]:
# Trigram counts over the pre-tokenized reviews. ngram_range is passed as a tuple,
# the documented type; newer scikit-learn rejects a list during parameter validation.
bow_transform = CountVectorizer(tokenizer=lambda doc: doc, ngram_range=(3, 3), lowercase=False)

In [27]:
# Learn the n-gram vocabulary from the training reviews and vectorize them.
X_tr_bow = bow_transform.fit_transform(training_data.Text_Cleansed)

In [28]:
# Vectorize the test reviews with the vocabulary learned on the training set.
X_te_bow = bow_transform.transform(test_data.Text_Cleansed)

In [29]:
# Target vectors for the two splits.
y_tr, y_te = training_data['Label'], test_data['Label']

# TF-IDF

In [5]:
# Re-weight the training counts by TF-IDF; norm=None leaves the rows unnormalized.
tfidf_transform = text.TfidfTransformer(norm=None)
X_tr_tfidf = tfidf_transform.fit(X_tr_bow).transform(X_tr_bow)

NameError: name 'X_tr_bow' is not defined

In [None]:
def simple_logistic_classify(X_tr, y_tr, X_test, y_test, description, _C=1.0):
    """Fit a LogisticRegression on the training split and report its test score.

    Arguments:
    X_tr, y_tr -- training features and labels
    X_test, y_test -- features and labels the fitted model is scored on
    description -- label for the feature set, used only in the printed message
    _C -- value passed to LogisticRegression as its C parameter

    Returns:
    The fitted model.
    """
    classifier = LogisticRegression(C=_C)
    classifier.fit(X_tr, y_tr)
    test_score = classifier.score(X_test, y_test)
    print('Test Score with', description, 'features', test_score)
    return classifier