# global functions

In [2]:
import pandas as pd
import numpy as np
import re
import string

In [3]:
def handle_nan(data):
    """Return the result of filling every missing value in ``data`` with a single space."""
    return data.fillna(" ")

def wordopt(text):
    """Reduce a raw document to lowercase word tokens separated by spaces.

    Processing order:
      1. lowercase the text;
      2. remove square-bracketed spans (``[...]``);
      3. remove URLs (``http://``, ``https://`` and ``www.`` forms);
      4. remove HTML-style tags (``<...>``);
      5. replace every remaining non-word character with a space;
      6. remove ``string.punctuation`` characters that survive step 5
         (in practice the underscore, which counts as a word character);
      7. remove every token that contains a digit.

    Args:
        text: the document as a ``str``.

    Returns:
        The cleaned string. Runs of spaces are not collapsed.
    """
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    # URLs and tags must go before the non-word pass below: that pass turns
    # '://', '.', '<' and '>' into spaces, after which these patterns can no
    # longer match and the URL/tag fragments would be kept as tokens.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    # Newlines are non-word characters too, so no separate '\n' pass is needed.
    text = re.sub(r'\W', ' ', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

# remove stop words
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
nltk.download('stopwords')
stop_words = set(stopwords.words("english"))
ps = PorterStemmer()
def stemming(text):
    """Return ``text`` reduced to Porter-stemmed, lowercase, non-stop-word tokens.

    Every character outside ``a-z``/``A-Z`` is first replaced by a space, the
    result is lowercased and split on whitespace, English stop words are
    dropped, and the remaining words are stemmed with the module-level
    ``ps`` stemmer and re-joined with single spaces.

    Args:
        text: the document as a ``str``.

    Returns:
        The stemmed tokens joined by single spaces.
    """
    words = re.sub(r'[^a-zA-Z]', ' ', text).lower().split()
    # Membership is tested against the prebuilt ``stop_words`` set instead of
    # calling stopwords.words('english') (which builds a list) once per word.
    return ' '.join(ps.stem(word) for word in words if word not in stop_words)

def namestr(obj, namespace):
    """List the keys of ``namespace`` that are bound to the very object ``obj``.

    The comparison uses identity (``is``), not equality, and the keys are
    returned in the namespace's iteration order.
    """
    matching_names = []
    for candidate in namespace:
        if namespace[candidate] is obj:
            matching_names.append(candidate)
    return matching_names

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ouyan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# read file and clean

In [5]:
# Load the training data and the data that predictions will be generated for.
train = pd.read_csv("datasets/train.csv")
predict = pd.read_csv("datasets/predict.csv")

# Keep only the first 1000 rows of each frame.
train = train[0:1000]
predict = predict[0:1000]

# Replace missing values with a single space (see handle_nan) —
# presumably so that wordopt below always receives a string.
train = handle_nan(train)
predict = handle_nan(predict)

# x: cleaned training text, y: training labels, p: cleaned text to predict on.
x = train['text'].apply(wordopt)
y = train['label']
p = predict['text'].apply(wordopt)

#x = train['text'].apply(stemming)
#p = predict['text'].apply(stemming)

# sklearn prepare functions

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
#from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

In [12]:
# Hold out 20% of the cleaned samples for evaluation. The fixed random_state
# makes the split, and every score computed from it, repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Fit the bag-of-words vocabulary on the training part only, then apply the
# same vocabulary to the held-out part.
vectorization = CountVectorizer()
x_train = vectorization.fit_transform(x_train)
x_test = vectorization.transform(x_test)

In [13]:
def manual_testing(model, news):
    """Score a single piece of text with a fitted model.

    ``news`` is wrapped into a one-row "text" column, transformed with the
    module-level ``vectorization`` vectorizer and passed to
    ``model.predict_proba``.

    Returns:
        A 2-tuple: the label string "proba of 0 & 1:" and the value returned
        by ``model.predict_proba``.
    """
    single_doc = pd.DataFrame({"text": [news]})["text"]
    features = vectorization.transform(single_doc)
    probabilities = model.predict_proba(features)
    return "proba of 0 & 1:", probabilities

def preciction(model, data):
    """Predict on ``data`` and write the result to ``out_<model name>.csv``.

    Args:
        model: a fitted estimator exposing ``predict`` and ``predict_proba``.
        data: iterable of cleaned documents accepted by the module-level
            ``vectorization`` vectorizer. Its rows are assumed to line up
            one-to-one, in order, with the rows of the global ``predict``
            frame, whose ``id`` column labels the output.

    The CSV has the columns ``id``, ``<model name>``, ``proba_0`` and
    ``proba_1``. The model name is the first global variable bound to
    ``model``; when there is none, the estimator's class name is used.
    """
    bound_names = namestr(model, globals())
    model_name = bound_names[0] if bound_names else type(model).__name__

    # Use the documents that were passed in; previously the argument was
    # ignored and the global ``p`` was always used.
    features = vectorization.transform(data)

    predicted_labels = pd.DataFrame(model.predict(features))
    predicted_proba = pd.DataFrame(model.predict_proba(features))
    # concat(axis=1) aligns on index labels, so give the ids the same fresh
    # positional index the prediction frames have.
    ids = predict[["id"]].reset_index(drop=True)

    submission = pd.concat([ids, predicted_labels, predicted_proba], axis=1)
    submission.columns = ["id", model_name, "proba_0", "proba_1"]
    submission.to_csv("out_{}.csv".format(model_name), index=False)

def cross_val(model):
    """Cross-validate ``model`` on the module-level ``x_train``/``y_train`` with 2 folds.

    The scorer is "neg_mean_squared_error", so each reported score is the
    negated mean squared error of one fold.

    Returns:
        Three ``(label, value)`` pairs: the per-fold scores, their mean and
        their standard deviation.
    """
    scores = cross_val_score(model, x_train, y_train, scoring="neg_mean_squared_error", cv=2)
    # NOTE(review): the previous version also computed np.sqrt(-scores) (RMSE)
    # but never used it; the unused value is dropped and the returned numbers
    # are unchanged. Confirm whether RMSE was meant to be reported instead.
    return ("Scores:", scores), ("Mean:", scores.mean()), ("Standard deviation:", scores.std())

def Confusion_Matrix_f1_score(model):
    """Return the confusion matrix and F1 score of 3-fold cross-validated predictions.

    Predictions come from ``cross_val_predict`` on the module-level
    ``x_train``/``y_train`` and are compared against ``y_train``.
    """
    cv_predictions = cross_val_predict(model, x_train, y_train, cv=3)
    matrix = confusion_matrix(y_train, cv_predictions)
    score = f1_score(y_train, cv_predictions)
    return matrix, score

## compare different sklearn algos

In [14]:
from sklearn.naive_bayes import MultinomialNB
# Fit a multinomial naive Bayes classifier on the vectorised training data.
Bayesian = MultinomialNB()
Bayesian.fit(x_train,y_train)
# NOTE(review): this held-out score is computed but not shown — only the
# cell's last expression is echoed as output.
Bayesian.score(x_test, y_test)
# Cross-validated confusion matrix and F1 score on the training data.
Confusion_Matrix_f1_score(Bayesian)

(array([[380,  17],
        [120, 283]], dtype=int64),
 0.8051209103840684)

In [None]:
preciction(Bayesian,p)
manual_testing(Bayesian, str("Not quite all the action is  . Peter Thiel, a founder of PayPal and Palantir who was the first outside investor in Facebook, spoke at the Republican convention in July. The New York Times reported on Saturday that Mr. Thiel is giving $1. 25 million to support Mr. Trump’s candidacy even as other supporters flee. (He also recently gave $1 million to a “super PAC” that supports Senator Rob Portman, the Republican freshman running for   in Ohio.) Getting involved in politics used to be seen as clashing with Silicon Valley’s value system: You transform the world by making problems obsolete, not solving them through Washington. Nor did entrepreneurs want to alienate whatever segment of customers did not agree with them politically. Such reticence is no longer in style here. “We’re a bunch of nerds not used to having a lot of limelight,” said Dave McClure, an investor who runs a tech incubator called 500 Startups."))

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier (default settings) on the vectorised
# training data, then show its cross-validated confusion matrix and F1 score.
Logistic = LogisticRegression()
Logistic.fit(x_train,y_train)
Confusion_Matrix_f1_score(Logistic)

In [None]:
preciction(Logistic,p)
manual_testing(Logistic, str("Not quite all the action is  . Peter Thiel, a founder of PayPal and Palantir who was the first outside investor in Facebook, spoke at the Republican convention in July. The New York Times reported on Saturday that Mr. Thiel is giving $1. 25 million to support Mr. Trump’s candidacy even as other supporters flee. (He also recently gave $1 million to a “super PAC” that supports Senator Rob Portman, the Republican freshman running for   in Ohio.) Getting involved in politics used to be seen as clashing with Silicon Valley’s value system: You transform the world by making problems obsolete, not solving them through Washington. Nor did entrepreneurs want to alienate whatever segment of customers did not agree with them politically. Such reticence is no longer in style here. “We’re a bunch of nerds not used to having a lot of limelight,” said Dave McClure, an investor who runs a tech incubator called 500 Startups."))

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Fit a decision-tree classifier (default settings) on the vectorised
# training data, then show its cross-validated confusion matrix and F1 score.
DecisionTree = DecisionTreeClassifier()
DecisionTree.fit(x_train,y_train)
Confusion_Matrix_f1_score(DecisionTree)

In [None]:
preciction(DecisionTree,p)
manual_testing(DecisionTree, str("Not quite all the action is  . Peter Thiel, a founder of PayPal and Palantir who was the first outside investor in Facebook, spoke at the Republican convention in July. The New York Times reported on Saturday that Mr. Thiel is giving $1. 25 million to support Mr. Trump’s candidacy even as other supporters flee. (He also recently gave $1 million to a “super PAC” that supports Senator Rob Portman, the Republican freshman running for   in Ohio.) Getting involved in politics used to be seen as clashing with Silicon Valley’s value system: You transform the world by making problems obsolete, not solving them through Washington. Nor did entrepreneurs want to alienate whatever segment of customers did not agree with them politically. Such reticence is no longer in style here. “We’re a bunch of nerds not used to having a lot of limelight,” said Dave McClure, an investor who runs a tech incubator called 500 Startups."))

In [None]:

from sklearn.neighbors import KNeighborsClassifier
# Fit a k-nearest-neighbours classifier (default settings) on the vectorised
# training data, then show its cross-validated confusion matrix and F1 score.
k_neighbor = KNeighborsClassifier()
k_neighbor.fit(x_train,y_train)
Confusion_Matrix_f1_score(k_neighbor)

In [None]:
preciction(k_neighbor,p)
manual_testing(k_neighbor, str("Not quite all the action is  . Peter Thiel, a founder of PayPal and Palantir who was the first outside investor in Facebook, spoke at the Republican convention in July. The New York Times reported on Saturday that Mr. Thiel is giving $1. 25 million to support Mr. Trump’s candidacy even as other supporters flee. (He also recently gave $1 million to a “super PAC” that supports Senator Rob Portman, the Republican freshman running for   in Ohio.) Getting involved in politics used to be seen as clashing with Silicon Valley’s value system: You transform the world by making problems obsolete, not solving them through Washington. Nor did entrepreneurs want to alienate whatever segment of customers did not agree with them politically. Such reticence is no longer in style here. “We’re a bunch of nerds not used to having a lot of limelight,” said Dave McClure, an investor who runs a tech incubator called 500 Startups."))

In [None]:
#14 min to run
from sklearn.svm import SVC
# Fit a support-vector classifier (default settings) on the vectorised
# training data, then show its cross-validated confusion matrix and F1 score.
# NOTE(review): SVC is created without probability=True, so the
# predict_proba-based helpers (preciction, manual_testing) would presumably
# fail for this model — confirm before using them with SVC1.
SVC1 = SVC()
SVC1.fit(x_train,y_train)
Confusion_Matrix_f1_score(SVC1)

In [None]:
#33 min to run
from sklearn.neural_network import MLPClassifier
# Fit a multi-layer perceptron (L-BFGS solver, logistic activation) on the
# vectorised training data, then show its cross-validated confusion matrix
# and F1 score.
mlp  = MLPClassifier(solver='lbfgs', activation='logistic')
mlp.fit(x_train,y_train)
Confusion_Matrix_f1_score(mlp)

In [None]:
preciction(mlp,p)
manual_testing(mlp, str("Not quite all the action is  . Peter Thiel, a founder of PayPal and Palantir who was the first outside investor in Facebook, spoke at the Republican convention in July. The New York Times reported on Saturday that Mr. Thiel is giving $1. 25 million to support Mr. Trump’s candidacy even as other supporters flee. (He also recently gave $1 million to a “super PAC” that supports Senator Rob Portman, the Republican freshman running for   in Ohio.) Getting involved in politics used to be seen as clashing with Silicon Valley’s value system: You transform the world by making problems obsolete, not solving them through Washington. Nor did entrepreneurs want to alienate whatever segment of customers did not agree with them politically. Such reticence is no longer in style here. “We’re a bunch of nerds not used to having a lot of limelight,” said Dave McClure, an investor who runs a tech incubator called 500 Startups."))

# TF

In [3]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
import pandas as pd

#Deep learning libraries
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout

print(tf.config.list_physical_devices('GPU'))

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [18]:
def manual_predict_tf(model_name, str):
    """Run a Keras model on one raw piece of text.

    Args:
        model_name: the fitted Keras model (a model object, despite the name).
        str: the raw text to score. The parameter name shadows the builtin
            and is kept only so existing calls keep working.

    Returns:
        Whatever ``model_name.predict`` returns for the single encoded sample.

    The text is cleaned with ``wordopt``, encoded with ``one_hot`` over the
    module-level ``voc_size`` and padded to the module-level ``sent_length`` —
    the same steps applied to the training data. The earlier version ignored
    its text argument (it read the global ``pred``) and fitted a brand-new
    Tokenizer on the single input, so the word indices fed to the model were
    unrelated to the ones it was trained on.
    """
    cleaned = wordopt(str)
    # NOTE(review): one_hot is called with its default hash function here and
    # in the training cell; presumably that mapping is only stable within one
    # Python process — confirm before scoring with a model loaded from disk.
    encoded = [one_hot(cleaned, voc_size)]
    padded = tf.keras.preprocessing.sequence.pad_sequences(encoded, padding='post', maxlen=sent_length)
    return model_name.predict(padded)

def preciction_tf(model, data):
    """Write ``model.predict(data)`` next to the prediction ids as ``out_<model name>.csv``.

    The model name is the first global variable bound to ``model``; the ids
    come from the ``id`` column of the module-level ``predict`` frame. The
    output has two columns: ``id`` and ``<model name>``.
    """
    model_name = namestr(model, globals())[0]
    model_output = pd.DataFrame(model.predict(data))
    id_column = pd.DataFrame(predict["id"])

    frames = [id_column, model_output]
    submission = pd.concat(frames, axis=1)
    submission.columns = ["id", model_name]

    out_path = "out_{}.csv".format(model_name)
    submission.to_csv(out_path, index=False)

In [19]:
# Hold out 20% of the cleaned samples for the neural models. The fixed
# random_state makes the split repeatable across runs.
x_train_1, x_test_1, y_train_1, y_test_1 = train_test_split(x, y, test_size=0.2, random_state=42)

# Size of the index space passed to one_hot (also used as the Embedding
# layer's input dimension in build_model).
voc_size = 5000

# Encode every cleaned document as a list of integer word indices.
x_train_2 = [one_hot(words, voc_size) for words in x_train_1]
x_test_2 = [one_hot(words, voc_size) for words in x_test_1]
p_2 = [one_hot(words, voc_size) for words in p]


#tokenizer = Tokenizer(num_words=voc_size)
#tokenizer.fit_on_texts(x_train_1)
#
#x_train_2 = tokenizer.texts_to_sequences(x_train_1)
#x_test_2 = tokenizer.texts_to_sequences(x_test_1)
#
#tokenizer_p = Tokenizer(num_words=voc_size)
#tokenizer_p.fit_on_texts(p)
#p_2 = tokenizer_p.texts_to_sequences(p)

# Every encoded document is brought to maxlen=sent_length entries, with the
# padding placed after the real tokens (padding='post').
# NOTE(review): sent_length=5000 means each sample is a 5000-step sequence for
# the LSTM layers below — likely a large part of the training cost; consider
# a smaller value.
sent_length=5000
x_train_3 = tf.keras.preprocessing.sequence.pad_sequences(x_train_2, padding='post', maxlen=sent_length)
x_test_3 = tf.keras.preprocessing.sequence.pad_sequences(x_test_2, padding='post', maxlen=sent_length)
p_3 = tf.keras.preprocessing.sequence.pad_sequences(p_2, padding='post', maxlen=sent_length)

In [57]:
def build_model():
    """Build (but do not compile) the bidirectional-LSTM binary classifier.

    Layers: an Embedding over ``voc_size`` indices (32 dimensions), two
    stacked bidirectional LSTMs (64 units returning sequences, then 16
    units), a 64-unit ReLU dense layer, dropout of 0.5, and a single
    sigmoid output unit.

    Returns:
        The uncompiled ``tf.keras.Sequential`` model.
    """
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Embedding(voc_size, 32))
    model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)))
    model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(16)))
    model.add(tf.keras.layers.Dense(64, activation='relu'))
    model.add(tf.keras.layers.Dropout(0.5))
    # The model is trained with loss='binary_crossentropy', which by default
    # expects probabilities rather than raw logits, so the output unit needs a
    # sigmoid. Without it the loss and accuracy are computed on unbounded values.
    model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
    return model

# Build the network and compile it for binary classification with Adam.
rnn = build_model()
rnn.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
# Stop once val_loss has not improved for 5 epochs and restore the best weights.
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
# Train for up to 50 epochs; validation_split=0.1 sets aside part of the
# training data to compute the val_loss that early stopping monitors.
history = rnn.fit(x_train_3, y_train_1, epochs=50, validation_split=0.1, batch_size=64, shuffle=True, callbacks=[early_stop])

Epoch 1/50

KeyboardInterrupt: 

# using hyperparameter tuning

In [20]:
import keras_tuner as kt

def build_model(hp):
    """Build and compile the bidirectional-LSTM classifier for Keras Tuner.

    Same architecture as the zero-argument ``build_model`` defined in an
    earlier cell (which this definition replaces once the cell is run),
    except that the width of the dense layer is a tunable hyperparameter.

    Args:
        hp: the hyperparameter container supplied by the tuner. ``"dense"``
            is sampled as an integer from 16 to 128 in steps of 16.

    Returns:
        The compiled ``tf.keras.Sequential`` model.
    """
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Embedding(voc_size, 32))
    model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)))
    model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(16)))
    model.add(tf.keras.layers.Dense(hp.Int("dense", 16, 128, 16), activation='relu'))
    model.add(tf.keras.layers.Dropout(0.5))
    # 'binary_crossentropy' by default expects probabilities rather than raw
    # logits, so the output unit needs a sigmoid. Without it the tuned
    # objective (val_loss) is computed on unbounded values.
    model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Random search over the hyperparameters declared in build_model(hp):
# up to 5 trials, ranked by validation loss.
tuner = kt.RandomSearch(
    build_model,
    overwrite = True,
    objective='val_loss',
    max_trials=5)

# Each trial trains for 5 epochs with validation_split=0.1.
tuner.search(x_train_3, y_train_1, epochs=5, batch_size=64, validation_split=0.1)
# NOTE(review): best_model is not used by the cells that follow — they
# evaluate and predict with `rnn` instead. Confirm which model is intended.
best_model = tuner.get_best_models()[0]


Search: Running Trial #1

Value             |Best Value So Far |Hyperparameter
32                |?                 |dense

Epoch 1/5


KeyboardInterrupt: 

In [None]:
#test loss, test acc
rnn.evaluate(x_test_3, y_test_1)

In [None]:
pred = str(
    "Not quite all the action is  . Peter Thiel, a founder of PayPal and Palantir who was the first outside investor in Facebook, spoke at the Republican convention in July. The New York Times reported on Saturday that Mr. Thiel is giving $1. 25 million to support Mr. Trump’s candidacy even as other supporters flee. (He also recently gave $1 million to a “super PAC” that supports Senator Rob Portman, the Republican freshman running for   in Ohio.) Getting involved in politics used to be seen as clashing with Silicon Valley’s value system: You transform the world by making problems obsolete, not solving them through Washington. Nor did entrepreneurs want to alienate whatever segment of customers did not agree with them politically. Such reticence is no longer in style here. “We’re a bunch of nerds not used to having a lot of limelight,” said Dave McClure, an investor who runs a tech incubator called 500 Startups."
    )
manual_predict_tf(rnn, pred)

In [None]:
preciction_tf(rnn, p_3)

# autokeras

In [None]:
import autokeras as ak
x_train_ak = x_train_1.to_numpy()
y_train_ak = y_train_1.to_numpy()
x_test_ak = x_test_1.to_numpy()
y_test_ak = y_test_1.to_numpy()


In [None]:
# Initialize the text classifier.
clf = ak.TextClassifier(
    #max_trials=100,
    #overwrite = False
) 
# Feed the text classifier with training data.
clf.fit(x_train_ak, y_train_ak, epochs=50)

In [None]:
#test loss, test acc
clf.evaluate(x_test_ak, y_test_ak)

In [None]:
pd.DataFrame(clf.predict(p.to_numpy()))

In [21]:
import os

import numpy as np
import tensorflow as tf
from sklearn.datasets import load_files

import autokeras as ak


In [2]:
dataset = tf.keras.utils.get_file(
    fname="aclImdb.tar.gz",
    origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
    extract=True,
)

In [3]:
# Path of the extracted "aclImdb" folder, next to the downloaded archive.
IMDB_DATADIR = os.path.join(os.path.dirname(dataset), "aclImdb")

# Load only the "pos" and "neg" review folders; the training files are
# shuffled, the test files are not.
classes = ["pos", "neg"]
train_data = load_files(
    os.path.join(IMDB_DATADIR, "train"), shuffle=True, categories=classes
)
test_data = load_files(
    os.path.join(IMDB_DATADIR, "test"), shuffle=False, categories=classes
)

# NOTE(review): these assignments overwrite the x_train / y_train / x_test /
# y_test variables created in the sklearn section of this notebook.
x_train = np.array(train_data.data)
y_train = np.array(train_data.target)
x_test = np.array(test_data.data)
y_test = np.array(test_data.target)

print(x_train.shape)  # (25000,)
print(y_train.shape)  # (25000,)
print(x_train[0][:50])  # first 50 bytes of the first review (samples are bytes objects)

(25000,)
(25000,)
b'Zero Day leads you to think, even re-think why two'


In [15]:
# Initialize the text classifier.
clf = ak.TextClassifier(
    overwrite=True, max_trials=1
)  # It only tries 1 model as a quick demo.
# Feed the text classifier with training data.
clf.fit(x_train, y_train)
# Predict with the best model.
predicted_y = clf.predict(x_test)
# Evaluate the best model with testing data.
print(clf.evaluate(x_test, y_test))

Trial 1 Complete [00h 00m 38s]
val_loss: 0.3694274425506592

Best val_loss So Far: 0.3694274425506592
Total elapsed time: 00h 00m 38s
INFO:tensorflow:Oracle triggered exit


INFO:tensorflow:Oracle triggered exit






INFO:tensorflow:Assets written to: .\text_classifier\best_model\assets


INFO:tensorflow:Assets written to: .\text_classifier\best_model\assets


[0.6374950408935547, 0.6080800294876099]


In [7]:
clf.fit(
    x_train,
    y_train,
    # Split the training data and use the last 15% as validation data.
    validation_split=0.15,
)

In [8]:
# Number of leading samples used for training; the rest become validation data.
split = 5000
# Slice from freshly built full arrays instead of from x_train / y_train
# themselves. The old form overwrote its own inputs, so running the cell a
# second time left x_val / y_val empty.
x_all = np.array(train_data.data)
y_all = np.array(train_data.target)
x_val = x_all[split:]
y_val = y_all[split:]
x_train = x_all[:split]
y_train = y_all[:split]
clf.fit(
    x_train,
    y_train,
    epochs=2,
    # Use your own validation set.
    validation_data=(x_val, y_val),
)