# Building features from the processed data after initial exploration

In [1]:
# Importing the required libraries
import numpy as np
import os
import pandas as pd
from ast import literal_eval
# Step up one directory level when the current working directory path does
# not contain "Res"; the relative "Res/..." paths used below are resolved
# against the working directory.
if os.getcwd().find("Res") == -1:
    os.chdir("..")

In [2]:
# Load the data frame produced by the initial EDA step. The four token
# columns are parsed with literal_eval so each cell comes back as the Python
# literal it was serialised from instead of as a plain string.
intial_df = pd.read_csv(
    "Res/processed_data/final_spam_ham_df.csv",
    sep=",",
    converters=dict.fromkeys(
        ("tokens", "useful_tokens", "stemmed_tokens", "lemma_tokens"),
        literal_eval,
    ),
)
intial_df.head()

Unnamed: 0,id,mail_label,file_name,date_mail,owner_mail,content_message,subject_message,dataset,line_count,tokens,...,single_char_count,number_token_count,year_count,stopword_count,useful_tokens,median_useful_token_len,avg_useful_token_len,rareword_count,stemmed_tokens,lemma_tokens
0,4743,spam,Res/enron1/spam/4743.2005-06-25.GP.spam.txt,2005-06-25 00:00:00,GP,"Subject: what up , , your cam babe\r\nwhat are...","Subject: what up , , your cam babe",enron1,14,"[what, up, ,, ,, your, cam, babe, what, are, y...",...,4,0,0,59.0,"[cam, babe, looking, looking, companion, frien...",5.0,5.7125,46.0,"[cam, babe, look, look, companion, friendship,...","[cam, babe, looking, looking, companion, frien..."
1,1309,spam,Res/enron1/spam/1309.2004-06-08.GP.spam.txt,2004-06-08 00:00:00,GP,Subject: want to make more money ?\r\norder co...,Subject: want to make more money ?,enron1,9,"[want, to, make, more, money, ?, order, confir...",...,1,0,0,20.0,"[want, make, money, order, confirmation, order...",6.0,6.081081,8.0,"[want, make, money, order, confirm, order, shi...","[want, make, money, order, confirmation, order..."
2,726,spam,Res/enron1/spam/0726.2004-03-26.GP.spam.txt,2004-03-26 00:00:00,GP,Subject: food for thoughts\r\n[\r\njoin now - ...,Subject: food for thoughts,enron1,6,"[food, for, thoughts, [, join, now, -, take, a...",...,1,0,0,6.0,"[food, thoughts, join, take, free, tour, click...",4.0,5.0,0.0,"[food, thought, join, take, free, tour, click,...","[food, thought, join, take, free, tour, click,..."
3,202,spam,Res/enron1/spam/0202.2004-01-13.GP.spam.txt,2004-01-13 00:00:00,GP,Subject: miningnews . net newsletter - tuesday...,Subject: miningnews . net newsletter - tuesday...,enron1,97,"[miningnews, ., net, newsletter, -, tuesday, ,...",...,33,58,17,239.0,"[miningnews, net, newsletter, tuesday, january...",6.0,6.264654,165.0,"[miningnew, net, newslett, tuesday, januari, t...","[miningnews, net, newsletter, tuesday, january..."
4,3988,spam,Res/enron1/spam/3988.2005-03-06.GP.spam.txt,2005-03-06 00:00:00,GP,Subject: your pharmacy ta\r\nwould you want ch...,Subject: your pharmacy ta,enron1,2,"[your, pharmacy, ta, would, you, want, cheap, ...",...,0,0,0,2.0,"[pharmacy, ta, would, want, cheap, perscriptio...",4.5,5.2,5.0,"[pharmaci, would, want, cheap, perscript, http...","[pharmacy, would, want, cheap, perscriptions, ..."


In [3]:
from collections import Counter

# Vocabulary cut-offs: a token becomes an attribute only when it occurs
# strictly more than TOKEN_FREQUENCY_CUTOFF times across all mails and is
# strictly longer than TOKEN_LENGTH_CUTOFF characters.
TOKEN_FREQUENCY_CUTOFF = 220
TOKEN_LENGTH_CUTOFF = 2

# Flatten the per-mail lemma tokens into a single lower-cased list.
all_useful_tokens = [
    token.lower()
    for lemma_tokens in intial_df["lemma_tokens"]
    for token in lemma_tokens
]

# Corpus-wide occurrence count of every token.
all_useful_tokens_dict = dict(Counter(all_useful_tokens))

# Keep the frequent, non-trivial tokens. This is a threshold filter, not a
# fixed top-N selection, so the vocabulary size depends on the data.
# dict.items() is used because dict.iteritems() exists only on Python 2.
# Keys are already lower-cased above, so no further normalisation is needed.
useful_tokens_dict = {
    token: count
    for token, count in all_useful_tokens_dict.items()
    if count > TOKEN_FREQUENCY_CUTOFF and len(token) > TOKEN_LENGTH_CUTOFF
}

# Persist the selected vocabulary (token -> count) as the dict's text repr.
with open('Res/useful_tokens_dict.txt', 'w') as data:
    data.write(str(useful_tokens_dict))


def intersect_useful_tokens(lemma_tokens_row):
    """Return the distinct tokens of one mail that belong to the vocabulary.

    Parameters
    ----------
    lemma_tokens_row : iterable of str
        Tokens of a single mail; they are lower-cased before the lookup.

    Returns
    -------
    list of str
        Unique lower-cased tokens that are keys of ``useful_tokens_dict``,
        sorted alphabetically so the result is identical from run to run
        (plain set iteration order is not stable under hash randomisation).
    """
    lowered = (token.lower() for token in lemma_tokens_row)
    # Membership is tested against the dict directly, which avoids rebuilding
    # a set of all vocabulary keys for every row.
    return sorted({token for token in lowered if token in useful_tokens_dict})


# Per-mail attribute list, its size (stored as float) and a space-joined
# text form of the attributes.
intial_df["attributes"] = intial_df["lemma_tokens"].apply(intersect_useful_tokens)
intial_df["attributes_len"] = intial_df["attributes"].apply(len).astype(float)
intial_df["corpus"] = intial_df["attributes"].apply(" ".join)

In [4]:
# Building the tf-idf data frame
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = list(intial_df["corpus"])
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on releases without it.
feature_names = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names()
)

# toarray() yields a plain ndarray (todense() returns the discouraged
# np.matrix). Reusing intial_df's index guarantees that the axis=1 concat
# below pairs every mail with its own tf-idf row.
attributes_df = pd.DataFrame(X.toarray(), columns=feature_names, index=intial_df.index)

# NOTE: intial_df is rebound to the widened frame, so running this cell a
# second time without reloading the data appends the tf-idf columns again.
intial_df = pd.concat([intial_df, attributes_df], axis=1)

In [5]:
# Writing the final features data into attributes_df.csv file
# attributes_hot_encode is another name for intial_df, not a copy of it.
attributes_hot_encode = intial_df
attributes_hot_encode.to_csv("Res/processed_data/attributes_df.csv", sep=",", index=False)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on releases without it.
feature_names = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names()
)

# One vectorizer feature name per line.
with open("Res/processed_data/attributes_names.txt", 'w') as f:
    for item in feature_names:
        f.write("%s\n" % item)

In [6]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on releases without it.
feature_names = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names()
)
print("Completed, Number of attributes created = {}".format(len(feature_names)))

Completed, Number of attributes created = 2938
