In [1]:
from myfunctions import *
from sklearn.feature_extraction.text import CountVectorizer
from deep_pytorch import *
from sklearn.preprocessing import StandardScaler
import joblib



In [2]:
# Load the line-delimited JSON dataset and drop the link column, which is not used below.
data_full = pd.read_json('fake_news.json', lines=True)
data_full = data_full.drop(columns=['article_link']) # remove link column

# Two-stage split through the project helper: first separate the test frame,
# then split what is left into training and validation frames.
# NOTE(review): the exact meaning of `test_size` is defined by split_dataframe - confirm there.
df_train_f, df_test = split_dataframe(data_full, test_size=0.25, seed=1509)
df_train, df_validate = split_dataframe(df_train_f, test_size=0.2, seed=1309)

# Class balance of each subset. One loop replaces three copy-pasted report blocks;
# the module-level names keep their original spelling and end up holding the
# values of the last (testing) subset, exactly as before.
for subset_title, subset_frame in (
    ('TRAINING', df_train),
    ('VALIDATING', df_validate),
    ('TESTING', df_test),
):
    list_label = subset_frame['is_sarcastic'].tolist()
    numb_total = len(list_label)
    numb_sarcastic = np.sum(np.asarray(list_label))
    numb_not_sarcastic = numb_total - numb_sarcastic
    print(
        f'===== {subset_title} SAMPLES =====\n'
        f'Total Sample: {numb_total}\n'
        f'Sarcastic: {numb_sarcastic} ({np.round(numb_sarcastic/numb_total*100,2)}%)\n'
        f'Not Sarcastic: {numb_not_sarcastic} ({np.round(numb_not_sarcastic/numb_total*100,2)}%)'
    )

===== TRAINING SAMPLES =====
Total Sample: 18316
Sarcastic: 8726 (47.64%)
Not Sarcastic: 9590 (52.36%)
===== VALIDATING SAMPLES =====
Total Sample: 4579
Sarcastic: 2181 (47.63%)
Not Sarcastic: 2398 (52.37%)
===== TESTING SAMPLES =====
Total Sample: 5724
Sarcastic: 2727 (47.64%)
Not Sarcastic: 2997 (52.36%)


In [3]:
def _add_headline_stages(frame):
    """Add the staged headline columns to ``frame`` in place and return it.

    Columns written, in this order:
      * ``headline_s1`` - ``remove_symbol`` applied to ``headline``
      * ``headline_s2`` - ``headline_s1`` passed through ``lemmatize_word``
        with 'v' and then with 'n'
      * ``headline_s3`` - ``remove_stop_words`` applied to ``headline_s2``

    The frame is modified in place, as the original cell did, so the caller's
    frame also carries the new columns afterwards.
    """
    frame['headline_s1'] = frame.headline.apply(remove_symbol)
    verb_pass = frame.headline_s1.apply(lambda text: lemmatize_word(text, 'v'))
    frame['headline_s2'] = verb_pass.apply(lambda text: lemmatize_word(text, 'n'))
    frame['headline_s3'] = frame.headline_s2.apply(remove_stop_words)
    return frame


# data_train / data_val are the same objects as df_train / df_validate at this
# point, so the split frames gain the staged columns too.
data_train = _add_headline_stages(df_train)
data_val = _add_headline_stages(df_validate)

# Stop-word-free variants: drop the raw and intermediate text columns so that
# headline_s3 is the remaining text column.
data_val_rmsw = data_val.copy().drop(columns=['headline', 'headline_s1', 'headline_s2'])
data_train_rmsw = data_train.copy().drop(columns=['headline', 'headline_s1', 'headline_s2'])

# Lemmatised variants (stop words kept): headline_s2 is the remaining text column.
# From here on data_train / data_val are new frames, no longer aliases of the splits.
data_train = data_train.drop(columns=['headline', 'headline_s1', 'headline_s3'])
data_val = data_val.drop(columns=['headline', 'headline_s1', 'headline_s3'])

In [4]:
def most_common_words(sent, numb_words=20):
    """Rank the whitespace-separated tokens of ``sent`` by frequency.

    Args:
        sent: Text to analyse; tokens are obtained with ``str.split()``.
        numb_words: How many of the top tokens to return. A negative value,
            or a value larger than the number of distinct tokens, returns
            every distinct token.

    Returns:
        A ``(top_words, count_words)`` pair of equal-length lists: the tokens
        in descending order of frequency and their occurrence counts.
    """
    ranked = Counter(sent.split()).most_common()
    # Out-of-range requests (negative or too large) fall back to "all tokens".
    limit = numb_words if 0 <= numb_words <= len(ranked) else len(ranked)

    top_words, count_words = [], []
    for token, freq in ranked[:limit]:
        top_words.append(token)
        count_words.append(freq)
    return top_words, count_words

# Join every stop-word-free training headline into one whitespace-separated
# string and rank all of its tokens (numb_words=-1 asks for the full ranking).
all_string = data_train_rmsw['headline_s3'].tolist()
all_string_in_one = ' '.join(all_string)
list_common_words, count_words = most_common_words(all_string_in_one, numb_words=-1)

# Frequency table: one row per distinct token, in the order returned above.
cwdf = pd.DataFrame({
    'count_words': np.asarray(count_words),
    'words': list_common_words,
})

In [7]:
# Rows from position 9355 onward; in the recorded output these are the words
# that occur exactly once.
cwdf.iloc[9355:]

Unnamed: 0,count_words,words
9355,1,taster
9356,1,neyo
9357,1,russialinked
9358,1,oldperson
9359,1,awkwardness
...,...,...
18612,1,tamper
18613,1,woodward
18614,1,arthamptons
18615,1,appelhof


In [6]:
# cwdf comes from Counter.most_common(), i.e. in descending count order, so the
# first row with a count of 1 marks where the once-only words begin.
is_singleton = cwdf['count_words'] == 1
s = cwdf.index[is_singleton].tolist()[0]
print(f"Number of unique words remaining: {s}")
print(f"Total discard (Unknown Token): {cwdf.iloc[s:,0].sum()}")

Number of unique words remaining: 9355
Total discard (Unknown Token): 9262


In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Corpus for the TF-IDF vocabulary: the headline_s2 column of the training
# frame (note: this rebinds all_string, previously built from headline_s3).
all_string = data_train['headline_s2'].tolist()

vectorizer = TfidfVectorizer()
# tokenize and build vocab; the fitted estimator is this cell's displayed value
vectorizer.fit(all_string)

TfidfVectorizer()

In [24]:
# Summarise the fitted vectorizer: vocabulary size first, then the idf_ array.
vocab_size = len(vectorizer.vocabulary_)
print(vocab_size)
print(vectorizer.idf_)

18337
[10.12243769 10.12243769  5.9953033  ... 10.12243769 10.12243769
 10.12243769]


In [32]:
# Vectorise a single sample headline and densify it for inspection.
# .iloc[0] takes the first row by position; a plain [0] is a label lookup,
# which fails with KeyError whenever index label 0 is not part of this split.
sample_headline = data_train.headline_s2.iloc[0]
vector = vectorizer.transform([sample_headline]).toarray()

In [40]:
# Peek at entries 10..99 of the single dense row produced above.
vector[0, 10:100]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.])

In [8]:
# Summary statistics of the frequency table; in the recorded output only the
# numeric count_words column is reported.
cwdf.describe()

Unnamed: 0,count_words
count,18617.0
mean,7.138959
std,25.460874
min,1.0
25%,1.0
50%,2.0
75%,4.0
max,1128.0


In [13]:
# 5th percentile of the word counts. numeric_only=True is spelled out because
# cwdf also holds the string column `words`; pandas >= 2.0 no longer skips
# non-numeric columns by default and would raise on it.
cwdf.quantile(.05, axis=0, numeric_only=True)

count_words    1.0
Name: 0.05, dtype: float64

In [15]:
# 95th percentile of the word counts. numeric_only=True is spelled out because
# cwdf also holds the string column `words`; pandas >= 2.0 no longer skips
# non-numeric columns by default and would raise on it.
cwdf.quantile(.95, axis=0, numeric_only=True)

count_words    29.0
Name: 0.95, dtype: float64

In [14]:
# cwdf comes from Counter.most_common(), i.e. in descending count order, so the
# first row with a count of 1 marks where the once-only words begin.
is_singleton = cwdf['count_words'] == 1
s = cwdf.index[is_singleton].tolist()[0]
print(f"Number of unique words remaining: {s}")
print(f"Total discard (Unknown Token): {cwdf.iloc[s:,0].sum()}")

Number of unique words remaining: 9355
Total discard (Unknown Token): 9262


In [16]:
# Cut the vocabulary at 29 occurrences (the 0.95 quantile of count_words).
# cwdf is in descending count order, so the first row at or below the cut-off
# is where the discarded words begin. Using <= instead of == keeps this working
# when no word occurs exactly 29 times; if nothing is at or below the cut-off,
# every word is kept and the discarded total is 0.
freq_cutoff = 29
at_or_below_cutoff = np.flatnonzero(cwdf.iloc[:, 0].to_numpy() <= freq_cutoff)
s = int(at_or_below_cutoff[0]) if at_or_below_cutoff.size else len(cwdf)
print(f"Number of unique words remaining: {s}")
print(f"Total discard (Unknown Token): {cwdf.iloc[s:,0].sum()}")

Number of unique words remaining: 907
Total discard (Unknown Token): 61719


In [18]:
# First ten rows of the frequency table, i.e. the ten most frequent tokens
# (cwdf was built in most_common() order).
cwdf.head(10)

Unnamed: 0,count_words,words
0,1128,trump
1,1040,new
2,958,man
3,653,get
4,609,woman
5,522,make
6,488,say
7,467,report
8,366,u
9,362,time
