# Classifying Posts with a Simple Classification Model

In [2]:
SEED = 7789
import pandas as pd
import numpy as np
import random
from nltk.tokenize import word_tokenize
import nltk
import string, re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import pandas_gbq as gbq
import pydata_google_auth

## Extracting All Posts Containing the Keywords from BigQuery

In [4]:
def bigquery_auth():
    """Run the Google user-OAuth flow and return the resulting credentials.

    The requested scopes cover Cloud Platform and Google Drive.

    Returns:
        The credentials object produced by
        ``pydata_google_auth.get_user_credentials``. Previously the object was
        dropped at the end of the function, so the scopes requested here never
        reached the BigQuery client.
    """
    SCOPES = [
        'https://www.googleapis.com/auth/cloud-platform',
        'https://www.googleapis.com/auth/drive',
    ]

    credentials = pydata_google_auth.get_user_credentials(
        SCOPES,
        # Set auth_local_webserver to True to have a slightly more convenient
        # authorization flow. Note, this doesn't work if you're running from a
        # notebook on a remote server, such as over SSH or with Google Colab.
        auth_local_webserver=True,
    )
    return credentials

# Register the credentials with pandas-gbq so that later `gbq.read_gbq` calls
# in this notebook use them instead of starting a separate default auth flow.
gbq.context.credentials = bigquery_auth()

In [23]:
# BigQuery SQL: selects every column from the `1000_page_post` and
# `politician_post` tables for posts created on or after 2015-05-01 whose
# name, message, caption or description contains (case-insensitively) one of
# "immigration", "mexican", "muslim" or "immigrant".
# The two SELECTs are combined with UNION ALL, so rows present in both tables
# are NOT de-duplicated by the query itself.
immigration_query = '''
(SELECT
  *
FROM
  `1000_page_post.201501_to_201611_all`
WHERE
    TIMESTAMP(post_created_date_CT) >= TIMESTAMP('2015-05-01') AND
  (LOWER(post_name) LIKE "%immigration%" OR
  LOWER(post_message) LIKE "%immigration%" OR
  LOWER(post_caption) LIKE "%immigration%" OR
  LOWER(post_description) LIKE "%immigration%" OR
  LOWER(post_name) LIKE "%mexican%" OR
  LOWER(post_message) LIKE "%mexican%" OR
  LOWER(post_caption) LIKE "%mexican%" OR
  LOWER(post_description) LIKE "%mexican%" OR
  LOWER(post_name) LIKE "%muslim%" OR
  LOWER(post_message) LIKE "%muslim%" OR
  LOWER(post_caption) LIKE "%muslim%" OR
  LOWER(post_description) LIKE "%muslim%" OR
  LOWER(post_name) LIKE "%immigrant%" OR
  LOWER(post_message) LIKE "%immigrant%" OR
  LOWER(post_caption) LIKE "%immigrant%" OR
  LOWER(post_description) LIKE "%immigrant%"))
UNION ALL
(SELECT
  *
FROM
  `politician_post.201501_to_201611_all`
WHERE
    TIMESTAMP(post_created_date_CT) >= TIMESTAMP('2015-05-01') AND
  (LOWER(post_name) LIKE "%immigration%" OR
  LOWER(post_message) LIKE "%immigration%" OR
  LOWER(post_caption) LIKE "%immigration%" OR
  LOWER(post_description) LIKE "%immigration%" OR
  LOWER(post_name) LIKE "%mexican%" OR
  LOWER(post_message) LIKE "%mexican%" OR
  LOWER(post_caption) LIKE "%mexican%" OR
  LOWER(post_description) LIKE "%mexican%" OR
  LOWER(post_name) LIKE "%muslim%" OR
  LOWER(post_message) LIKE "%muslim%" OR
  LOWER(post_caption) LIKE "%muslim%" OR
  LOWER(post_description) LIKE "%muslim%" OR
  LOWER(post_name) LIKE "%immigrant%" OR
  LOWER(post_message) LIKE "%immigrant%" OR
  LOWER(post_caption) LIKE "%immigrant%" OR
  LOWER(post_description) LIKE "%immigrant%"))
'''

In [24]:
# Run the keyword query in the 'ntufbdata' project and keep the result as
# `immigration`. The download is slow (see the progress output below), so
# avoid re-running this cell unnecessarily.
immigration = gbq.read_gbq(immigration_query, project_id='ntufbdata')

Downloading: 100%|██████████| 669076/669076 [05:45<00:00, 1938.78rows/s]


## Preprocessing

In [3]:
def concat_messages(df):
    """Add a ``concat`` text column to ``df`` in place.

    Missing values in ``post_name``, ``post_message`` and ``post_description``
    are replaced with empty strings, then the three fields are joined with
    single spaces into a new ``concat`` column.

    Args:
        df: DataFrame holding the three post text columns. It is modified in
            place; nothing is returned.
    """
    # Assign the filled column back explicitly. The chained form
    # `df.col.fillna('', inplace=True)` acts on a temporary Series under
    # pandas copy-on-write and would leave the NaNs in `df`.
    for field in ('post_name', 'post_message', 'post_description'):
        df[field] = df[field].fillna('')
    df['concat'] = df['post_name'] + ' ' + df['post_message'] + ' ' + df['post_description']

In [12]:
def get_wordnet_pos(word):
    """Return the WordNet POS constant that ``lemmatize()`` expects for ``word``.

    The word is tagged on its own with ``nltk.pos_tag`` and the upper-cased
    first letter of the tag picks the constant: J -> adjective, N -> noun,
    V -> verb, R -> adverb. Any other tag falls back to noun.
    """
    tagged = nltk.pos_tag([word])
    first_letter = tagged[0][1][0].upper()

    if first_letter == "J":
        return wordnet.ADJ
    if first_letter == "V":
        return wordnet.VERB
    if first_letter == "R":
        return wordnet.ADV
    # "N" and every unmapped tag resolve to noun.
    return wordnet.NOUN

def tokenize_with_lemmentize(document ,lemmentize = True):
    """Tokenizer (with optional lemmatization) for the bag-of-words vectorizer.

    Steps: strip URLs, tokenize with ``word_tokenize``, keep purely alphabetic
    tokens, lower-case them, drop English stopwords, and optionally lemmatize
    using the POS returned by ``get_wordnet_pos``.

    Args:
        document: Text to tokenize; it is passed through ``str()`` first.
        lemmentize: When True (default) each kept token is lemmatized.

    Returns:
        list of lower-cased (and optionally lemmatized) tokens.
    """
    lemmatizer = WordNetLemmatizer()
    # A set gives O(1) membership tests; the stopword list is all lower-case.
    removed = set(stopwords.words('english')) | set(string.punctuation)
    text = re.sub(r'http\S+', '', str(document))

    words = []
    for token in word_tokenize(text):
        if not token.isalpha():
            continue
        # Lower-case BEFORE the stopword check. Otherwise capitalised
        # stopwords such as "The" or "I" slip through the filter.
        lowered = token.lower()
        if lowered in removed:
            continue
        words.append(lowered)

    if lemmentize:
        return [lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in words]
    return words

### Concatenate Post Content

In [25]:
# Name of the text column produced by concat_messages().
col = 'concat'
# The query uses UNION ALL, so the same post_id can appear more than once;
# keep only the first occurrence.
immigration = immigration.drop_duplicates(subset=['post_id'], keep="first")
# Adds the 'concat' column to `immigration` in place.
concat_messages(immigration)
# Work on an independent copy so the cleaning below does not touch `immigration`.
target = immigration.dropna(subset=[col]).copy()
# Strip carriage returns and lower-case the text before vectorizing.
target[col] = target[col].str.replace('\r', '').str.lower()

### Tokenization and Bag of Words

In [14]:
# Load the labelled posts.
# NOTE(review): absolute, machine-specific path — this cell only runs on the original host.
df = pd.read_csv('/home3/r05322021/Desktop/FB_hatecrime/Data/label/immigration_label.csv', encoding='utf-8', engine='python')
# Keep only rows whose three label columns are each exactly 0 or 1.
df = df[(df.Mexican_related.isin([0,1])) & (df.Muslim_related.isin([0,1])) & (df.immigration_related.isin([0,1]))]
col = 'concat'
# Same preprocessing as the prediction set: build 'concat', strip '\r', lower-case.
concat_messages(df)
target_train = df.dropna(subset=[col]).copy()
target_train[col] = target_train[col].str.replace('\r', '').str.lower()

In [26]:
# NOTE(review): `bow` is not defined anywhere in the visible notebook, because
# the fitting code below is commented out. On a fresh kernel this cell raises
# NameError; presumably `bow` was left over from an earlier run or a deleted
# cell. It must be the same fitted vectorizer that the saved TF-IDF
# transformer was trained with — confirm before re-enabling the fit.
# bow = CountVectorizer(tokenizer=tokenize_with_lemmentize, 
#                       token_pattern=u'(?ui)\\b\\w*[a-z]+\\w*\\b', 
#                       min_df=5
#                      )
# bow = bow.fit(target_train[col])
# Convert the cleaned prediction-set text into bag-of-words counts.
X = bow.transform(target[col])

### TFIDF

In [27]:
# `load` was previously imported only in a later cell (In [38]), so this cell
# raised NameError on a fresh "Restart & Run All". Import it where it is first used.
from joblib import load

# Load the saved TF-IDF transformer and apply it to the bag-of-words counts.
# NOTE(review): joblib files are pickle-based — only load files from a trusted source.
tfidf = load('/home3/r05322021/Desktop/model/transformation/TFIDF_immigration.joblib')
tfidf_X = tfidf.transform(X)

## Predict and Check

In [33]:
def random_show(df, y_col, sample_size, seed=None):
    """Print a random sample of the posts in ``df`` that were predicted positive.

    Args:
        df: DataFrame with ``post_name``, ``post_message``,
            ``post_description`` and the prediction column ``y_col``.
        y_col: Name of the prediction column; rows where it equals 1 are
            the candidates.
        sample_size: Maximum number of posts to print. If fewer positive rows
            exist, all of them are printed instead of raising ``ValueError``.
        seed: Optional seed for a private random generator, so the same
            sample can be reproduced. ``None`` uses the global ``random`` state.
    """
    # Sample from the positive rows of `df` itself. The earlier version drew
    # positions from the positive COUNT but looked them up in the global
    # `target` frame, so it printed arbitrary rows whatever the prediction.
    positives = df[df[y_col] == 1]
    k = min(sample_size, len(positives))
    rng = random.Random(seed) if seed is not None else random

    for pos in rng.sample(range(len(positives)), k=k):
        row = positives.iloc[pos]
        fields = [row.post_name, row.post_message, row.post_description]
        # Guard against missing values so str.join never receives a float NaN.
        print('\n\n', '\n'.join('' if pd.isna(f) else str(f) for f in fields))

In [38]:
from joblib import load
# Label to predict; the model filename uses the label without its 8-character
# "_related" suffix (here "Muslim").
y_col = 'Muslim_related'
# NOTE(review): joblib files are pickle-based — only load files from a trusted source.
svm_m = load(f'/home3/r05322021/Desktop/model/SVM/{y_col[:-8]}_svm.joblib')
# Store the predictions on `immigration`. This relies on `tfidf_X` having the
# same row order as `immigration` — TODO confirm no rows were dropped when
# `target` was built.
immigration[y_col] = svm_m.predict(tfidf_X)

In [40]:
# Spot-check: print 20 randomly chosen posts for manual inspection of the predictions.
random_show(immigration, y_col, 20)



 Republican Town Plans to Stop Muslim Burials By Beheading Pig, Pouring Blood on Ground
The town is located in a heavily red county in Texas -- knowing that, do any of these comments even come close to surprising you?
Texas resident threatens to stop Islamic cemetery, which would allow local Muslims to bury their dead in-town, by dousing the land with pigs' blood.


 Chipotle closes 43 restaurants amid E. coli outbreak
Chipotle Mexican Grill has closed 43 of the chain's restaurants amid an E. coli outbreak.
"We immediately closed all of our restaurants in the area out of an abundance of caution, even though the vast majority of these restaurants have no reported problems."


 Hillary Clinton on Twitter
Sorry Scotland. But to the folks flying the Mexican flag near Trump’s property:  VIVA SCOTLAND!
“People in Scotland are not thrilled about Trump coming to their country. We know the feeling.”


 This Islamic Country Just Took A HUGE Leap Forward On LGBT Rights (VIDEO)
That moment when 

## Save Prediction

In [42]:
# Write the posts together with the predicted label column to CSV (row index omitted).
# NOTE(review): absolute, machine-specific output path.
immigration.to_csv(r'/home3/r05322021/Desktop/FB Data/issue_post/Simple/immigration_predicted.csv', index=False)