In [5]:
import os
import warnings
import sys

import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

import re
import nltk

import matplotlib.pyplot as plt
import seaborn as sns

from flask import Flask, jsonify, request, render_template
from flask_cors import CORS, cross_origin
import pickle

from nltk import *
#nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

In [6]:
# Location of the tweets CSV. The default is the original author's local path;
# set the TWEETS_CSV environment variable to run the notebook on another
# machine without editing this cell.
DATA_PATH = os.environ.get(
    "TWEETS_CSV",
    "C:\\Users\\MaryWillcock\\Documents\\GitHub\\NLP_Flask_Test\\tweets.csv",
)
data = pd.read_csv(DATA_PATH)
data.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [8]:
# Select the model inputs/targets by column name rather than by position, so
# the selection keeps working if the CSV's column order changes.
# "text" (position 10) holds the tweet body and "airline_sentiment"
# (position 1) holds the negative/neutral/positive label, as shown by
# data.head() above.
features = data["text"].values
labels = data["airline_sentiment"].values

In [73]:
def _clean_text(text):
    """Normalise one raw document into lower-case, space-separated words."""
    # Replace every non-word character (punctuation, symbols, whitespace) with a space
    cleaned = re.sub(r'\W', ' ', str(text))

    # Remove single letters that stand alone between whitespace
    cleaned = re.sub(r'\s+[a-zA-Z]\s+', ' ', cleaned)

    # Remove a single letter at the very start of the text. The caret must be
    # unescaped to anchor at the start; the previous pattern r'\^...' looked
    # for a literal '^', which can never be present after the \W substitution.
    # This rule also covers the stray leading 'b' left by a bytes repr, which
    # previously had its own r'^b\s+' substitution.
    cleaned = re.sub(r'^[a-zA-Z]\s+', ' ', cleaned)

    # Collapse runs of whitespace into a single space
    cleaned = re.sub(r'\s+', ' ', cleaned)

    return cleaned.lower()


def clean_n_vect(feat, vectorizer=None, fit=True):
    """Clean raw text documents and return their dense TF-IDF matrix.

    Parameters
    ----------
    feat : sequence
        Raw documents. Each item is converted with ``str()`` before cleaning.
    vectorizer : optional
        Vectorizer object exposing ``fit_transform`` / ``transform``. When
        omitted, a new ``TfidfVectorizer(max_features=2300, min_df=7,
        max_df=0.8)`` with NLTK English stop words is created and fitted on
        ``feat`` (the original behaviour). Pass your own instance to keep the
        fitted vocabulary so new text can be encoded with the same columns.
    fit : bool, default True
        Only consulted when ``vectorizer`` is supplied: ``True`` fits it on
        ``feat`` (``fit_transform``); ``False`` reuses its existing fit
        (``transform``).

    Returns
    -------
    numpy.ndarray
        Dense matrix (``.toarray()`` of the vectorizer output) with one row
        per input document.
    """
    processed_feats = [_clean_text(document) for document in feat]

    if vectorizer is None:
        # A freshly created vectorizer has no vocabulary yet, so it must be fitted.
        vectorizer = TfidfVectorizer(max_features=2300, min_df=7, max_df=0.8,
                                     stop_words=stopwords.words('english'))
        fit = True

    if fit:
        matrix = vectorizer.fit_transform(processed_feats)
    else:
        matrix = vectorizer.transform(processed_feats)
    return matrix.toarray()

In [74]:
# Turn the raw tweets into a TF-IDF matrix, then hold out 20% of the rows for
# evaluation. random_state is fixed so the split is reproducible.
processed_features = clean_n_vect(features)
X_train, X_test, y_train, y_test = train_test_split(
    processed_features,
    labels,
    test_size=0.2,
    random_state=0,
)

<class 'numpy.ndarray'>


In [75]:
def eval_metrics(actual, pred):
    """Compute classification metrics for a set of predictions.

    Parameters
    ----------
    actual : array-like
        Ground-truth labels.
    pred : array-like
        Predicted labels, in the same order as ``actual``.

    Returns
    -------
    tuple
        ``(confusion_matrix, classification_report, accuracy_score)`` as
        returned by the corresponding scikit-learn functions.
    """
    # Use the arguments, not the notebook globals y_test / predictions, so the
    # function evaluates whatever the caller actually passes in.
    confusionMatrix = confusion_matrix(actual, pred)
    classificationReport = classification_report(actual, pred)
    accScore = accuracy_score(actual, pred)
    return confusionMatrix, classificationReport, accScore

In [76]:
# Number of trees in the forest.
n_est = 200
# random_state is fixed so repeated runs train the same model.
text_classifier = RandomForestClassifier(n_estimators=n_est, random_state=0)
text_classifier.fit(X_train, y_train)

# Score the held-out rows and compare against their true labels.
predictions = text_classifier.predict(X_test)

(confusionMatrix, classificationReport, accScore) = eval_metrics(y_test, predictions)

# n_est is an integer: %d prints "200", whereas %f printed "200.000000".
print("Random Forest Classifier model (n_estimators=%d):" % (n_est))
print(confusionMatrix)
print(classificationReport)
print("  Accuracy: %s" % accScore)

Random Forest Classifier model (n_estimators=200.000000):
[[1726  108   36]
 [ 332  243   39]
 [ 141   60  243]]
              precision    recall  f1-score   support

    negative       0.78      0.92      0.85      1870
     neutral       0.59      0.40      0.47       614
    positive       0.76      0.55      0.64       444

    accuracy                           0.76      2928
   macro avg       0.71      0.62      0.65      2928
weighted avg       0.74      0.76      0.74      2928

  Accuracy: 0.755464480874317


# Single Response

Adjust the code above so that it accepts a single response (one review) and codes it, i.e. predicts its sentiment label.

In [91]:
# Example review text used below to demonstrate scoring a single new response.
rev2 = "You have the rudest employees! This lady kicked me off of my flight, unbelievable"

In [92]:
# Wrap the single review in a list so it can be appended to the tweet array.
features2 = [rev2]
# clean_n_vect() creates and fits a new TfidfVectorizer on whatever it is
# given, so the new review is appended to the full corpus and everything is
# vectorised again together.
featz = np.concatenate((features, features2))
# NOTE(review): because the vectorizer is refit here on corpus + new review,
# the resulting columns/IDF weights are not guaranteed to match the matrix
# text_classifier was trained on — confirm, or reuse the fitted vectorizer.
processed_features2 = clean_n_vect(featz)
# Keep only the last two rows: the final original tweet and the appended
# review (the review is therefore at index 1).
processed_features2 = processed_features2[-2:]

<class 'numpy.ndarray'>


In [93]:
# Ask the trained classifier for both the predicted label and the per-class
# probabilities of the rows kept in processed_features2.
predictions2 = text_classifier.predict(processed_features2)
prediction_proba = text_classifier.predict_proba(processed_features2)

# Index 1 is the second of the two retained rows, i.e. the appended review.
print("Prediction: ", predictions2[1])
print("Probabilities: ", prediction_proba[1])

Prediction:  negative
Probabilities:  [0.73  0.095 0.175]


In [None]:
# Create the Flask application object; the route decorators below register
# their view functions on it.
app = Flask(__name__)
# Cross-origin support (flask_cors) is currently disabled.
#CORS(app)

# File extensions (lower-case, without the dot) accepted by allowed_file().
# NOTE(review): ALLOWED_EXTENSIONS was referenced but never defined in this
# notebook, so allowed_file() raised NameError on a fresh kernel. The values
# below are an assumption — TODO confirm which upload types are expected.
ALLOWED_EXTENSIONS = {'csv', 'txt'}


# function to check the file extension
def allowed_file(filename):
    """Return True when ``filename`` has an extension in ALLOWED_EXTENSIONS.

    The extension is the text after the last '.' and is compared
    case-insensitively. Names without a '.' are rejected.
    """
    return '.' in filename and \
           filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS

@app.route("/")
def home_page():
    """Render the landing page template (index.html) for the site root."""
    return render_template('index.html')

# The original decorator had `methods=` outside the route() call (a syntax
# error) and reused "/", which home_page() already serves. "/review" is the
# path requested in the recorded server log.
@app.route('/review', methods=['GET', 'POST'])
def predict():
    """Serve the review page, or score a submitted request.

    Any non-POST request renders ``review.html``.

    POST expects an uploaded ``file`` with an allowed extension plus a
    ``data`` value (JSON body or form field) containing comma-separated
    numbers. On success it returns JSON ``{"prediction": <float>}``.
    A missing or disallowed file re-renders ``review.html`` with a message.
    A missing or non-numeric ``data`` value returns a JSON error with
    status 400.
    """
    if request.method != 'POST':
        return render_template('review.html')

    # check if there is a file in the request
    if 'file' not in request.files:
        return render_template('review.html', msg='No file selected')
    uploaded = request.files['file']
    # if no file is selected
    if uploaded.filename == '':
        return render_template('review.html', msg='No file selected')
    # Previously a disallowed extension fell through and the view returned
    # None, which Flask reports as a server error.
    if not allowed_file(uploaded.filename):
        return render_template('review.html', msg='File type not allowed')

    # A request carrying a file upload is normally not JSON-encoded, so read
    # the JSON body tolerantly and fall back to a form field of the same name.
    body = request.get_json(silent=True)
    if not isinstance(body, dict):
        body = {}
    payload = body.get('data') or request.form.get('data')
    if not payload:
        return jsonify(error='Missing "data" value'), 400

    try:
        values = [float(i) for i in str(payload).split(',')]
    except ValueError:
        return jsonify(error='"data" must be comma-separated numbers'), 400

    # NOTE(review): `headers` and `model` are not defined anywhere in the
    # visible notebook (the classifier trained above is `text_classifier`,
    # which was fitted on TF-IDF features) — confirm what this endpoint is
    # meant to score before relying on it.
    input_variables = pd.DataFrame([values],
                                   columns=headers,
                                   dtype=float,
                                   index=['input'])
    # Get the model's prediction
    prediction_proba = model.predict_proba(input_variables)
    prediction = (prediction_proba[0])[1]

    # Let Flask build the JSON response instead of concatenating strings.
    return jsonify(prediction=float(prediction))
# Start Flask's built-in server when this cell/script is executed directly.
# app.run() is called with no arguments; the recorded output below shows it
# listening on http://127.0.0.1:5000/ until interrupted.
if __name__ == "__main__":
    app.run()

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off


 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)
127.0.0.1 - - [04/Nov/2019 09:56:06] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [04/Nov/2019 09:56:06] "GET /favicon.ico HTTP/1.1" 404 -
[2019-11-04 09:56:07,906] ERROR in app: Exception on /review [GET]
Traceback (most recent call last):
  File "C:\Users\MaryWillcock\Anaconda3\lib\site-packages\flask\app.py", line 2446, in wsgi_app
    response = self.full_dispatch_request()
  File "C:\Users\MaryWillcock\Anaconda3\lib\site-packages\flask\app.py", line 1951, in full_dispatch_request
    rv = self.handle_user_exception(e)
  File "C:\Users\MaryWillcock\Anaconda3\lib\site-packages\flask\app.py", line 1820, in handle_user_exception
    reraise(exc_type, exc_value, tb)
  File "C:\Users\MaryWillcock\Anaconda3\lib\site-packages\flask\_compat.py", line 39, in reraise
    raise value
  File "C:\Users\MaryWillcock\Anaconda3\lib\site-packages\flask\app.py", line 1949, in full_dispatch_request
    rv = self.dispatch_request()
  File "C:\Users\M