### Import all the necessary packages. Sklearn is the most essential library used that provides feature extraction, training SVM and evaluating the performance of the model.

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
import pandas as pd
import re

### Load the dataset. Pandas is a data analysis library that lets you read the csv file from disk and load it to the working memory for further processing.

In [2]:
data = pd.read_csv('text_classification_dataset.csv')
data = shuffle(data)

### Perform a little preprocessing of the text data.

In [3]:
contractions_mapping = {"i'm": 'i am',
 "i've": 'i have',
 "you're": 'you are',
 'dont': 'do not',
 "don't": 'do not',
 "can't": 'can not',
 'cant': 'can not',
 "what's": 'what is',
 'whats': 'what is',
 "how's": 'how is',
 'hows': 'how is',
 "\\'nt": 'not',
 '^\\w\\s': '',
 "\\'s": '',
 '\\n': ' '}

def regex_clean(doc):
    doc = doc.lower()
    for k,v in contractions_mapping.items():
        doc = re.sub(k,v,doc)
    return doc

In [4]:
data['reviews'] = data['reviews'].apply(lambda x:regex_clean(x))

### We are going to use Tfidf for feature extraction. TfIdf score is the score that calculates the relative importance for each term in the vocabulary. The idea behind tfidf score is that the words stop words like "the", "is", "a" etc are more frequent in the vocabulary, therefore these words should have lesser importance as compared to "awesome", "amazing", "awful" etc.

In [5]:
tfidf = TfidfVectorizer(stop_words=None)
tfidf.fit(data['reviews'])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [6]:
train, test = train_test_split(data, test_size=0.2)

In [7]:
X_train, Y_train = list(train['reviews']), list(train['labels'])
X_test, Y_test = list(test['reviews']), list(test['labels'])

In [8]:
tfidf_X_train = tfidf.transform(X_train)
tfidf_X_test = tfidf.transform(X_test)

In [9]:
clf = LinearSVC()
clf.fit(tfidf_X_train, Y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [10]:
preds = clf.predict(tfidf_X_test)
accuracy_score(Y_test, preds)

0.82