In [9]:
#import models
from sklearn.naive_bayes import MultinomialNB as MNB
from sklearn.linear_model import LogisticRegression as LR
from sklearn.metrics import classification_report
#imports
import os
import re
import nltk
import numpy as np
from sklearn import feature_extraction
from tqdm import tqdm
import pandas as pd

In [10]:
_wnl = nltk.WordNetLemmatizer()
def normalize_word(w):
    return _wnl.lemmatize(w).lower()
def get_tokenized_lemmas(s):
    return [normalize_word(t) for t in nltk.word_tokenize(s)]
def clean(text):
    # Cleans a string: Lowercasing, trimming, removing non-alphanumeric,removed https and @mentions too
    text=re.sub(r'@[^ ]',' ',text)
    text=re.sub(r'https?://[^ ]+',' ',text)
    return " ".join(re.findall(r'\w+', text, flags=re.UNICODE)).lower()
def remove_stopwords(l):
    # Removes stopwords from a list of tokens
    return [w for w in l if w not in feature_extraction.text.ENGLISH_STOP_WORDS]

In [11]:
_wnl = nltk.WordNetLemmatizer()
def normalize_word(w):
    return _wnl.lemmatize(w).lower()
def get_tokenized_lemmas(s):
    return [normalize_word(t) for t in nltk.word_tokenize(s)]
def clean(s):
    # Cleans a string: Lowercasing, trimming, removing non-alphanumeric

    return " ".join(re.findall(r'\w+', s, flags=re.UNICODE)).lower()


def remove_stopwords(l):
    # Removes stopwords from a list of tokens
    return [w for w in l if w not in feature_extraction.text.ENGLISH_STOP_WORDS]

def gen_or_load_feats(feat_fn, headlines, bodies, feature_file):
    if not os.path.isfile(feature_file):
        feats = feat_fn(headlines, bodies)
        np.save(feature_file, feats)

    return np.load(feature_file)




def word_overlap_features(headlines, bodies):
    X = []
    for i, (headline, body) in tqdm(enumerate(zip(headlines, bodies))):
        clean_headline = clean(headline)
        clean_body = clean(body)
        clean_headline = get_tokenized_lemmas(clean_headline)
        clean_body = get_tokenized_lemmas(clean_body)
        features = [
            len(set(clean_headline).intersection(clean_body)) / float(len(set(clean_headline).union(clean_body)))]
        X.append(features)
    return X


def refuting_features(headlines, bodies):
    _refuting_words = [
        'fake',
        'fraud',
        'hoax',
        'false',
        'deny', 'denies',
        # 'refute',
        'not',
        'despite',
        'nope',
        'doubt', 'doubts',
        'bogus',
        'debunk',
        'pranks',
        'retract'
    ]
    X = []
    for i, (headline, body) in tqdm(enumerate(zip(headlines, bodies))):
        clean_headline = clean(headline)
        clean_headline = get_tokenized_lemmas(clean_headline)
        features = [1 if word in clean_headline else 0 for word in _refuting_words]
        X.append(features)
    return X


def polarity_features(headlines, bodies):
    _refuting_words = [
        'fake',
        'fraud',
        'hoax',
        'false',
        'deny', 'denies',
        'not',
        'despite',
        'nope',
        'doubt', 'doubts',
        'bogus',
        'debunk',
        'pranks',
        'retract'
    ]

    def calculate_polarity(text):
        tokens = get_tokenized_lemmas(text)
        return sum([t in _refuting_words for t in tokens]) % 2
    X = []
    for i, (headline, body) in tqdm(enumerate(zip(headlines, bodies))):
        clean_headline = clean(headline)
        clean_body = clean(body)
        features = []
        features.append(calculate_polarity(clean_headline))
        features.append(calculate_polarity(clean_body))
        X.append(features)
    return np.array(X)


def ngrams(input, n):
    input = input.split(' ')
    output = []
    for i in range(len(input) - n + 1):
        output.append(input[i:i + n])
    return output


def chargrams(input, n):
    output = []
    for i in range(len(input) - n + 1):
        output.append(input[i:i + n])
    return output


def append_chargrams(features, text_headline, text_body, size):
    grams = [' '.join(x) for x in chargrams(" ".join(remove_stopwords(text_headline.split())), size)]
    grams_hits = 0
    grams_early_hits = 0
    grams_first_hits = 0
    for gram in grams:
        if gram in text_body:
            grams_hits += 1
        if gram in text_body[:255]:
            grams_early_hits += 1
        if gram in text_body[:100]:
            grams_first_hits += 1
    features.append(grams_hits)
    features.append(grams_early_hits)
    features.append(grams_first_hits)
    return features


def append_ngrams(features, text_headline, text_body, size):
    grams = [' '.join(x) for x in ngrams(text_headline, size)]
    grams_hits = 0
    grams_early_hits = 0
    for gram in grams:
        if gram in text_body:
            grams_hits += 1
        if gram in text_body[:255]:
            grams_early_hits += 1
    features.append(grams_hits)
    features.append(grams_early_hits)
    return features


def hand_features(headlines, bodies):

    def binary_co_occurence(headline, body):
        # Count how many times a token in the title
        # appears in the body text.
        bin_count = 0
        bin_count_early = 0
        for headline_token in clean(headline).split(" "):
            if headline_token in clean(body):
                bin_count += 1
            if headline_token in clean(body)[:255]:
                bin_count_early += 1
        return [bin_count, bin_count_early]

    def binary_co_occurence_stops(headline, body):
        # Count how many times a token in the title
        # appears in the body text. Stopwords in the title
        # are ignored.
        bin_count = 0
        bin_count_early = 0
        for headline_token in remove_stopwords(clean(headline).split(" ")):
            if headline_token in clean(body):
                bin_count += 1
                bin_count_early += 1
        return [bin_count, bin_count_early]

    def count_grams(headline, body):
        # Count how many times an n-gram of the title
        # appears in the entire body, and intro paragraph

        clean_body = clean(body)
        clean_headline = clean(headline)
        features = []
        features = append_chargrams(features, clean_headline, clean_body, 2)
        features = append_chargrams(features, clean_headline, clean_body, 8)
        features = append_chargrams(features, clean_headline, clean_body, 4)
        features = append_chargrams(features, clean_headline, clean_body, 16)
        features = append_ngrams(features, clean_headline, clean_body, 2)
        features = append_ngrams(features, clean_headline, clean_body, 3)
        features = append_ngrams(features, clean_headline, clean_body, 4)
        features = append_ngrams(features, clean_headline, clean_body, 5)
        features = append_ngrams(features, clean_headline, clean_body, 6)
        return features

    X = []
    for i, (headline, body) in tqdm(enumerate(zip(headlines, bodies))):
        X.append(binary_co_occurence(headline, body)
                 + binary_co_occurence_stops(headline, body)
                 + count_grams(headline, body))


    return X

In [12]:
def merger(body,stance):
    enter=[]
    for i in stance['BodyID']:
        enter.append(body['ArticleBody'][(body.BodyID[body.BodyID==i].index.tolist())[0]])
    stance.insert(3,'ArticleBody',enter)

In [13]:
def stance_detect(train_stances,Test,model):
    Labels=train_stances.drop(['Headline','BodyID','ArticleBody'],axis=1)
    Label_Result=pd.DataFrame()
    dat=hand_features(train_stances['Headline'],train_stances['ArticleBody'])
    Label_Result['Stance_unrelated']=model.fit(dat,Labels['Stance_unrelated']).predict(hand_features(Test['Headline'],Test['ArticleBody']))
    Label_Result['Stance_disagree']=model.fit(dat,Labels['Stance_disagree']).predict(hand_features(Test['Headline'],Test['ArticleBody']))
    Label_Result['Stance_agree']=model.fit(dat,Labels['Stance_agree']).predict(hand_features(Test['Headline'],Test['ArticleBody']))
    Label_Result['Stance_discuss']=model.fit(dat,Labels['Stance_discuss']).predict(hand_features(Test['Headline'],Test['ArticleBody']))
    return Label_Result
def Predict(train,test,Model=LR):
    X=stance_detect(train,test,Model).stack()
    def strip(text):
        return text[7:]
    return pd.Series(pd.Categorical(X[X!=0].index.get_level_values(1))).apply(strip)

In [14]:
#columns customized
#enter train and test body
train_bodies=pd.read_csv('train_bodies.csv')
train_stances=pd.read_csv('train_stances.csv')
train_bodies.columns=['BodyID','ArticleBody']
train_stances.columns=['Headline','BodyID','Stance']
train_stances=pd.get_dummies(train_stances,prefix=['Stance'],columns=['Stance'])
merger(train_bodies,train_stances)

In [17]:
#Enter Train,Test and Model
#Models-----------------------
#LR  = LOGISTIC REGRESSION
#MNB = MULTINOMIAL NAIVE BAYES
L=len(train_stances)
Submission=Predict(train_stances[round(0.2*L):],train_stances[:round(0.2*L)],LR())

39978it [02:10, 306.51it/s]
9994it [00:32, 304.19it/s]
9994it [00:32, 304.54it/s]
9994it [00:33, 302.36it/s]
9994it [00:33, 299.49it/s]


In [18]:
print(Submission)

0       unrelated
1         discuss
2       unrelated
3       unrelated
4           agree
5         discuss
6       unrelated
7       unrelated
8       unrelated
9       unrelated
10      unrelated
11        discuss
12      unrelated
13        discuss
14      unrelated
15      unrelated
16      unrelated
17        discuss
18      unrelated
19      unrelated
20      unrelated
21      unrelated
22        discuss
23      unrelated
24      unrelated
25      unrelated
26      unrelated
27          agree
28        discuss
29      unrelated
          ...    
9051    unrelated
9052    unrelated
9053    unrelated
9054      discuss
9055    unrelated
9056    unrelated
9057    unrelated
9058    unrelated
9059    unrelated
9060    unrelated
9061      discuss
9062        agree
9063      discuss
9064    unrelated
9065      discuss
9066      discuss
9067    unrelated
9068    unrelated
9069    unrelated
9070      discuss
9071    unrelated
9072    unrelated
9073        agree
9074      discuss
9075    un