In [2]:
from Script_Processing.machine_learning_wrapper import MachineLearningWrapper
import pandas as pd
import numpy as np
import pickle
import eli5



# Import Annotated Data

In [None]:
# Load the annotated tweets DataFrame from a pickle file.
# NOTE: the path is left empty in this template - set it to your annotated-data pickle before running.
# Only unpickle files you trust: unpickling can execute arbitrary code.
annotated = pd.read_pickle("")

In [None]:
# Column of `annotated` that holds the text for each preprocessing variant.
text_columns = {
    "clean": "clean_tweet_all_texts",
    "stem": "stemmed_all_texts",
    "lemma": "lemmatized_all_texts",
    "lemma_stem": "lemmatize_stem_all_texts",
}

# One list of documents per variant, keyed by the variant name.
datasets = {variant: list(annotated[col]) for variant, col in text_columns.items()}

# Keep the individual lists available under their own names as well.
clean_annotated = datasets["clean"]
stem_annotated = datasets["stem"]
lemma_annotated = datasets["lemma"]
lemma_stem_annotated = datasets["lemma_stem"]

# Labels taken from the "Encode" column, and the number of annotated documents.
y = np.array(list(annotated["Encode"]))
N = len(clean_annotated)

## Construct stop words

### Just run this

If you need to add more stop words, one option is to create them in a separate notebook using the included script, save them to `stop_words_dict.pkl`, and load them here.

In [5]:
import pickle
# Load the stop words, stored per preprocessing mode (looked up later as stop_words_dict[mode]).
# The context manager guarantees the file handle is closed once loading finishes.
# NOTE: pickle can execute arbitrary code on load - only use a file you generated yourself.
with open("stop_words_dict.pkl", "rb") as stop_words_file:
    stop_words_dict = pickle.load(stop_words_file)

### Build Vectorizer

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
# Preprocessing variant to use; it is the key looked up in both `datasets` and `stop_words_dict`.
mode = "lemma_stem"

In [None]:
# TF-IDF featurizer configured for the selected preprocessing mode.
vectorizer = TfidfVectorizer(
    sublinear_tf=True,                 # scale term frequency as 1 + log(tf)
    max_features=500,                  # keep at most the 500 most frequent terms
    max_df=0.15,                       # ignore terms found in more than 15% of documents
    min_df=0.01,                       # ignore terms found in fewer than 1% of documents
    stop_words=stop_words_dict[mode],  # stop words stored for the chosen mode
    ngram_range=(1, 1),                # unigrams only
)

In [None]:
# Fit the vectorizer on the annotated texts of the selected mode and transform them into the feature matrix X.
X = vectorizer.fit_transform(np.array(datasets[mode]))

# Machine Learning

In [5]:
# Project helper that supplies the classifiers and the training / scoring calls used below.
ml_wrapper = MachineLearningWrapper()
# Class names handed to the wrapper - presumably used as labels when reporting scores; confirm in MachineLearningWrapper.
ml_wrapper.set_target_names(["Target", "Other"])

In [11]:
# Ask the wrapper for its classifiers (presumably unfitted instances - confirm); they are picked by position below.
classifiers = ml_wrapper.get_fresh_classifiers()

In [13]:
# Train only two of the classifiers for speed - per the author's note, the
# gradient-descent SVM and LR; switch to the full `classifiers` list later on.
# NOTE(review): positions 4 and 5 depend on the order returned by get_fresh_classifiers().
select_classifiers = [classifiers[idx] for idx in (4, 5)]
select_classifiers

In [None]:
# Train the selected classifiers; the three returned values are bound to y_scores, y_preds and y_true.
# NOTE(review): X_res / y_res are not defined in any cell shown here, so this raises
# NameError on a fresh kernel. Presumably they are resampled versions of X / y - restore
# the cell that creates them, or confirm what should be passed instead.
y_scores, y_preds, y_true = ml_wrapper.perform_training(
    X_res, y_res, select_classifiers, verbose = True)

In [None]:
# Score the trained classifiers from the training outputs (see MachineLearningWrapper.calculate_scores for the metrics reported).
ml_wrapper.calculate_scores(y_scores, y_preds, y_true)

In [None]:
# Use the first selected classifier (the SVM, per the author's note) to select the best features, for convenience.
svm = select_classifiers[0]

In [None]:
# Fit the final classifier. The call returns two values: `best_clf` (indexed with [0] at
# prediction time) and `select_from_model`, whose .transform() is applied to the features
# before predicting.
# NOTE(review): X_res / y_res are not defined in any visible cell, so this fails on a
# fresh kernel - confirm where they should come from (presumably resampled X / y).
best_clf, select_from_model = ml_wrapper.fit_final_classifier(
    svm, X_res, y_res)

# Predict on production data (the unannotated set minus the annotated tweets)

In [None]:
def apply_prediction(df, column, vec=None, selector=None, classifier=None):
    """Classify the texts in ``df[column]`` and store the labels in ``df["Prediction"]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the texts. It is modified in place (a "Prediction"
        column is added or overwritten) and is also returned.
    column : str
        Name of the column containing the preprocessed text.
    vec, selector, classifier : optional
        Fitted vectorizer, feature selector and classifier to use. When left
        as ``None`` they fall back to the notebook globals ``vectorizer``,
        ``select_from_model`` and ``best_clf[0]`` respectively.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the "Prediction" column set.

    Side effects
    ------------
    Prints the value counts of the predicted labels.
    """
    # Resolve defaults lazily so the notebook globals are only required
    # when the caller does not supply its own fitted objects.
    if vec is None:
        vec = vectorizer
    if selector is None:
        selector = select_from_model
    if classifier is None:
        classifier = best_clf[0]

    features = selector.transform(vec.transform(df[column]))
    prediction = classifier.predict(features)
    # np.asarray accepts any array-like returned by predict(), not only ndarray.
    df["Prediction"] = np.asarray(prediction).tolist()
    print(df["Prediction"].value_counts())
    return df

In [None]:
# Load the unannotated (production) tweets to classify.
# NOTE: the path is left empty in this template - fill it in before running, and only unpickle files you trust.
unannotated = pd.read_pickle("")

In [None]:
# Predict a label for every unannotated tweet; apply_prediction writes a "Prediction" column and returns the frame.
# "lemmatize_stem_tweet" is the column name in our dataset,
# but you can pass whichever column name your dataframe has.
# NOTE(review): the column's preprocessing should presumably match `mode`, which the vectorizer was fitted on - confirm.
unannotated = apply_prediction(unannotated, "lemmatize_stem_tweet")

In [None]:
# Keep only the rows whose predicted label is 1.
# NOTE(review): presumably 1 marks the relevant class - confirm against the "Encode" labels and the target-name order.
relevant_tweets = unannotated[unannotated["Prediction"] == 1]

In [None]:
# Save the rows selected above to a pickle file in the working directory.
relevant_tweets.to_pickle("predicted_tweets.pkl")