# Detecting Insults in Social Networks

![](https://pbs.twimg.com/media/CkEyfjKUUAURpd9.jpg)

In [14]:
import re, string

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC

##  Load raw data

In [2]:
def clean_text(text):
    text = text.lower()
    text = re.findall(r'\b[a-z]+\b', text)
    return ' '.join(text)

In [3]:
training_data = pd.read_csv('train.csv')
training_data.head(7)

Unnamed: 0,Insult,Date,Comment
0,1,20120618192155Z,"""You fuck your dad."""
1,0,20120528192215Z,"""i really don't understand your point.\xa0 It ..."
2,0,,"""A\\xc2\\xa0majority of Canadians can and has ..."
3,0,,"""listen if you dont wanna get married to a man..."
4,0,20120619094753Z,"""C\xe1c b\u1ea1n xu\u1ed1ng \u0111\u01b0\u1edd..."
5,0,20120620171226Z,"""@SDL OK, but I would hope they'd sign him to ..."
6,0,20120503012628Z,"""Yeah and where are you now?"""


In [4]:
training_data['cleaned_comment'] = training_data['Comment'].map(clean_text)

In [5]:
training_data.head(7)

Unnamed: 0,Insult,Date,Comment,cleaned_comment
0,1,20120618192155Z,"""You fuck your dad.""",you fuck your dad
1,0,20120528192215Z,"""i really don't understand your point.\xa0 It ...",i really don t understand your point it seems ...
2,0,,"""A\\xc2\\xa0majority of Canadians can and has ...",a of canadians can and has been wrong before n...
3,0,,"""listen if you dont wanna get married to a man...",listen if you dont wanna get married to a man ...
4,0,20120619094753Z,"""C\xe1c b\u1ea1n xu\u1ed1ng \u0111\u01b0\u1edd...",c b xu bi t xecnh c ho kh nc ng d ng cu xed ch...
5,0,20120620171226Z,"""@SDL OK, but I would hope they'd sign him to ...",sdl ok but i would hope they d sign him to a o...
6,0,20120503012628Z,"""Yeah and where are you now?""",yeah and where are you now


## Make some features

In [6]:
count_vectorizer = CountVectorizer(analyzer='word', ngram_range=(1,3), stop_words='english', max_features=50000)
count_vectorizer.fit(training_data['cleaned_comment'])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=50000, min_df=1,
        ngram_range=(1, 3), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [7]:
count_vectorizer.vocabulary_

{'fuck': 3185,
 'dad': 1781,
 'really': 30107,
 'don': 2254,
 'understand': 40540,
 'point': 23882,
 'mixing': 9152,
 'apples': 301,
 'oranges': 18837,
 'really don': 30172,
 'don understand': 2307,
 'understand point': 40614,
 'point mixing': 23951,
 'mixing apples': 9153,
 'apples oranges': 302,
 'really don understand': 30178,
 'understand point mixing': 40616,
 'point mixing apples': 23952,
 'mixing apples oranges': 9154,
 'canadians': 1045,
 'wrong': 48282,
 'nunless': 16508,
 'idea': 4038,
 'proof': 27049,
 'perfect': 21899,
 'chances': 1166,
 'inadvertently': 4144,
 'kill': 4603,
 'son': 33852,
 'daughter': 1822,
 'breaks': 898,
 'regard': 31109,
 'damage': 1792,
 'like': 4916,
 'wartime': 44263,
 'sorry': 33866,
 'mail': 5274,
 'wrong nunless': 48325,
 'nunless supportive': 16509,
 'proof perfect': 27066,
 'perfect chances': 21900,
 'regard collateral': 31114,
 'wartime sorry': 44264,
 'wrong nunless supportive': 48326,
 'nunless supportive idea': 16510,
 'proof perfect chances

In [10]:
X = count_vectorizer.transform(training_data['cleaned_comment'])
y = training_data['Insult']

In [11]:
X

<3947x50000 sparse matrix of type '<class 'numpy.int64'>'
	with 94859 stored elements in Compressed Sparse Row format>

## Cross-validate

### Split data

In [None]:
mask = [bool(np.random.binomial(1, .75)) for _ in range(X.shape[0])]

In [None]:
mask

In [None]:
X[np.array(mask)]

In [None]:
sum(mask) 

In [None]:
y[mask].shape[0]

In [None]:
y[~mask].shape

In [12]:
# mAke this with sklearn 
def split_data(X, y, p=.75):
    mask = np.array([bool(np.random.binomial(1, p)) for _ in range(X.shape[0])])
    
    X_train = X[mask]
    y_train = y[mask]
    X_validation = X[~mask]
    y_validation = y[~mask]
    
    return X_train, y_train, X_validation, y_validation

In [15]:
X_train, y_train, X_validation, y_validation = split_data(X, y)

In [16]:
X_train.shape

(2941, 50000)

### Fit a model on training data

In [17]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

### Validate model on validation data

In [18]:
from sklearn.metrics import accuracy_score


predictions = model.predict(X_validation)
validation_score = accuracy_score(y_validation, predictions)

print('Validation Score:', validation_score)

Validation Score: 0.8131212723658051


In [19]:
baseline_predictions = np.zeros(predictions.shape[0])

In [20]:
baseline_validation_score = accuracy_score(y_validation, baseline_predictions)

print('Validation Score:', baseline_validation_score)

Validation Score: 0.7037773359840954


## Remember, everything is a hyper-parameter.. usalo copialo


In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

In [22]:
class PredictionPipeline:
    
    def __init__(self, ngram_range, vectorizer_class, model_class, training_data):
        self.ngram_range=ngram_range
        self.vectorizer_class=vectorizer_class
        self.model_class=model_class
        self.training_data=training_data
        self.vectorizer = None
        self.X = None
        self.y = None
        self.model = None
        self.validation_score = None
        
    def run(self):
        self._fit_vectorizer()
        self._featurize_text()
        self._split_train_and_validation_sets()
        self._fit_model_on_training_data()
        self._validate_model_on_validation_set()
        
        print(
            """
            Vectorizer Class: {vectorizer_class}\n\
            N-gram Range: {ngram_range}\n\
            Model Class: {model_class}\n\
            Validation Score: {validation_score}
            """.format(

            vectorizer_class=repr(self.vectorizer_class.__name__), 
            ngram_range=self.ngram_range, 
            model_class=repr(self.model_class.__name__), 
            validation_score=round(self.validation_score, 4)

            )
        )

    def _fit_vectorizer(self):
        self.vectorizer = vectorizer_class(analyzer='word', ngram_range=ngram_range, 
                                     stop_words='english', max_features=50000)
        self.vectorizer.fit(self.training_data['cleaned_comment'])
    
    def _featurize_text(self):
        self.X = self.vectorizer.transform(self.training_data['cleaned_comment'])
        self.y = self.training_data['Insult']

    def _split_train_and_validation_sets(self):
        self.X_train, self.y_train, self.X_validation, self.y_validation = split_data(
            self.X, self.y)

    def _fit_model_on_training_data(self):
        self.model = self.model_class()
        self.model.fit(self.X_train, self.y_train)

    def _validate_model_on_validation_set(self):
        predictions = self.model.predict(self.X_validation)
        self.validation_score = accuracy_score(self.y_validation, predictions)      

In [23]:
results = {}

for ngram_range in [(1, 1), (1, 2), (1, 3), (1, 4)]:
    for vectorizer_class in [CountVectorizer, TfidfVectorizer]:
        for model_class in [LogisticRegression, LinearSVC, RandomForestClassifier]:
            
            # run prediction pipeline
            prediction_pipeline = PredictionPipeline(
                ngram_range=ngram_range,
                vectorizer_class=vectorizer_class,
                model_class=model_class,
                training_data=training_data
            )
            
            prediction_pipeline.run()
            
            # add hyper-parameters to `results` dictionary
            results[str(prediction_pipeline.validation_score)] = {
                    'vectorizer_class': prediction_pipeline.vectorizer_class,
                    'ngram_range': prediction_pipeline.ngram_range,
                    'model_class': prediction_pipeline.model_class
            }


            Vectorizer Class: 'CountVectorizer'
            N-gram Range: (1, 1)
            Model Class: 'LogisticRegression'
            Validation Score: 0.825
            

            Vectorizer Class: 'CountVectorizer'
            N-gram Range: (1, 1)
            Model Class: 'LinearSVC'
            Validation Score: 0.7949
            





            Vectorizer Class: 'CountVectorizer'
            N-gram Range: (1, 1)
            Model Class: 'RandomForestClassifier'
            Validation Score: 0.8191
            





            Vectorizer Class: 'TfidfVectorizer'
            N-gram Range: (1, 1)
            Model Class: 'LogisticRegression'
            Validation Score: 0.799
            

            Vectorizer Class: 'TfidfVectorizer'
            N-gram Range: (1, 1)
            Model Class: 'LinearSVC'
            Validation Score: 0.833
            





            Vectorizer Class: 'TfidfVectorizer'
            N-gram Range: (1, 1)
            Model Class: 'RandomForestClassifier'
            Validation Score: 0.7981
            





            Vectorizer Class: 'CountVectorizer'
            N-gram Range: (1, 2)
            Model Class: 'LogisticRegression'
            Validation Score: 0.8259
            

            Vectorizer Class: 'CountVectorizer'
            N-gram Range: (1, 2)
            Model Class: 'LinearSVC'
            Validation Score: 0.8022
            





            Vectorizer Class: 'CountVectorizer'
            N-gram Range: (1, 2)
            Model Class: 'RandomForestClassifier'
            Validation Score: 0.8047
            





            Vectorizer Class: 'TfidfVectorizer'
            N-gram Range: (1, 2)
            Model Class: 'LogisticRegression'
            Validation Score: 0.7915
            

            Vectorizer Class: 'TfidfVectorizer'
            N-gram Range: (1, 2)
            Model Class: 'LinearSVC'
            Validation Score: 0.8203
            





            Vectorizer Class: 'TfidfVectorizer'
            N-gram Range: (1, 2)
            Model Class: 'RandomForestClassifier'
            Validation Score: 0.7944
            





            Vectorizer Class: 'CountVectorizer'
            N-gram Range: (1, 3)
            Model Class: 'LogisticRegression'
            Validation Score: 0.8409
            

            Vectorizer Class: 'CountVectorizer'
            N-gram Range: (1, 3)
            Model Class: 'LinearSVC'
            Validation Score: 0.811
            





            Vectorizer Class: 'CountVectorizer'
            N-gram Range: (1, 3)
            Model Class: 'RandomForestClassifier'
            Validation Score: 0.8093
            





            Vectorizer Class: 'TfidfVectorizer'
            N-gram Range: (1, 3)
            Model Class: 'LogisticRegression'
            Validation Score: 0.757
            

            Vectorizer Class: 'TfidfVectorizer'
            N-gram Range: (1, 3)
            Model Class: 'LinearSVC'
            Validation Score: 0.8193
            





            Vectorizer Class: 'TfidfVectorizer'
            N-gram Range: (1, 3)
            Model Class: 'RandomForestClassifier'
            Validation Score: 0.772
            





            Vectorizer Class: 'CountVectorizer'
            N-gram Range: (1, 4)
            Model Class: 'LogisticRegression'
            Validation Score: 0.8221
            

            Vectorizer Class: 'CountVectorizer'
            N-gram Range: (1, 4)
            Model Class: 'LinearSVC'
            Validation Score: 0.7943
            





            Vectorizer Class: 'CountVectorizer'
            N-gram Range: (1, 4)
            Model Class: 'RandomForestClassifier'
            Validation Score: 0.7965
            





            Vectorizer Class: 'TfidfVectorizer'
            N-gram Range: (1, 4)
            Model Class: 'LogisticRegression'
            Validation Score: 0.7677
            

            Vectorizer Class: 'TfidfVectorizer'
            N-gram Range: (1, 4)
            Model Class: 'LinearSVC'
            Validation Score: 0.8103
            





            Vectorizer Class: 'TfidfVectorizer'
            N-gram Range: (1, 4)
            Model Class: 'RandomForestClassifier'
            Validation Score: 0.8025
            


In [24]:
top_3_scores = sorted(results.keys(), reverse=True)[:3]

for score in top_3_scores:
    print('Score: {score}\nParameters: {parameters}\n'.format(
        score=score, parameters=results[score]))

Score: 0.8409090909090909
Parameters: {'vectorizer_class': <class 'sklearn.feature_extraction.text.CountVectorizer'>, 'ngram_range': (1, 3), 'model_class': <class 'sklearn.linear_model.logistic.LogisticRegression'>}

Score: 0.8329918032786885
Parameters: {'vectorizer_class': <class 'sklearn.feature_extraction.text.TfidfVectorizer'>, 'ngram_range': (1, 1), 'model_class': <class 'sklearn.svm.classes.LinearSVC'>}

Score: 0.8259149357072205
Parameters: {'vectorizer_class': <class 'sklearn.feature_extraction.text.CountVectorizer'>, 'ngram_range': (1, 2), 'model_class': <class 'sklearn.linear_model.logistic.LogisticRegression'>}



## Train final model

In [25]:
top_score_key = top_3_scores[0]

In [26]:
vectorizer_class = results[top_score_key]['vectorizer_class']
ngram_range = results[top_score_key]['ngram_range']
model_class = results[top_score_key]['model_class']

# fit vectorizer
vectorizer = vectorizer_class(analyzer='word', ngram_range=ngram_range, stop_words='english', max_features=50000)
vectorizer.fit(training_data['cleaned_comment'])

# transform text
X = vectorizer.transform(training_data['cleaned_comment'])
y = training_data['Insult']

# fit model on training data
model = model_class()
model.fit(X, y)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

## Run it live

In [None]:
while True:
    input_string = input('Please enter a string: ')
    input_string = clean_text(input_string)
    x_test = vectorizer.transform([input_string])
    
    prediction = model.predict(x_test)[0]
    print('Insult?: {}'.format( bool(prediction)))

Please enter a string: fuck you
Insult?: False
Please enter a string: you are stupid
Insult?: True
Please enter a string: eres marica
Insult?: False
Please enter a string: end
Insult?: False
