In [1]:
import pandas as pd
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer 
from nltk.stem import LancasterStemmer
from nltk.stem import SnowballStemmer
import re
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv("ham_spam.csv", encoding = 'latin-1')
df

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [3]:
df.describe()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
count,5572,5572,50,12,6
unique,2,5169,43,10,5
top,ham,"Sorry, I'll call later","bt not his girlfrnd... G o o d n i g h t . . .@""",GE,"GNT:-)"""
freq,4825,30,3,2,2


In [4]:
df = df.drop(['Unnamed: 2','Unnamed: 3','Unnamed: 4'], axis=1)

In [5]:
df

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [6]:
def return_words(row):
    
    # consistent casing
    row = row.lower()
    
    # # Tokenization
    row = re.sub('[^A-Za-z0-9\s]+', '', row)
    words = word_tokenize(row)
    
    # Removing Common words - stop words
    clean_list = []
    stop_words = stopwords.words('english')
    stop_words.append(["etc", "also"])
    for word in words:
        if word not in stop_words:
            clean_list.append(word)
    
    # # Stemming - Using this one - the below ones are just for reference:
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()
    words = []
    for word in clean_list:
        w = lemmatizer.lemmatize(word,pos='a')
        if w == word:
            w = lemmatizer.lemmatize(w,pos='v')
        if w == word:
            w = lemmatizer.lemmatize(w,pos='n')
        if (w == word) and (len(w)) > 3:
            w = stemmer.stem(w)
        words.append(w)
        
    words = list(set(words))

    return words
    

In [7]:
df

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [8]:
def test_lambda(row):
    row = row.split(" ")
    return row

In [9]:
df['words'] = df['v2'].apply(lambda x: return_words(x))

In [10]:
df.drop("test",axis=1)

KeyError: "['test'] not found in axis"

In [None]:
def bag_of_words(df):
    bag_of_words = []
    for index, row in df.iterrows():
        bag_of_words = bag_of_words + row['words']
    bag_of_words = list(set(bag_of_words))
    return bag_of_words

In [None]:
bag_of_words = bag_of_words(df)

In [None]:
len(bag_of_words)

In [None]:
column_names = ["mail_id"] + bag_of_words

In [None]:
df2 = pd.DataFrame(columns= column_names)

In [None]:
df2

In [None]:
import workers
output = workers.mapping_parallelize(df[:5],bag_of_words)
output

In [None]:
def mapping(df1,df2,column_names):
    for index, row in df1.iterrows():
        print("Processing mail id: ", index)
        # Adding mail id
        df2_dict = dict.fromkeys(column_names,[0])
        df2_dict['mail_id'] = index
        # Populating the words columns
        for word in row['words']:
            if word in bag_of_words:
                df2_dict[word] = 1
        
        df2_row = pd.DataFrame.from_dict(df2_dict)
        df2 = df2.append(df2_row)
    return df2

In [None]:
output = mapping(df[:500],df2,column_names)
output

In [None]:
output.describe()


In [None]:
output.shape

In [None]:
X = output
y = df['v1'][:500]

In [None]:
y = y.str.replace("ham",'0').str.replace("spam",'1')
y

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
X_train.shape

In [None]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB().fit(X_train, y_train)

In [None]:
model.score(X_test, y_test)

In [None]:
model.score(X_train, y_train)

In [None]:
def mapping_parallelize(df1):
    column_names = ["mail_id"] + bag_of_words
    df2 = pd.DataFrame(columns= column_names)
    for index, row in df1.iterrows():
        print("Processing mail id: ", index)
        # Adding mail id
        df2_dict = dict.fromkeys(column_names,[0])
        df2_dict['mail_id'] = index
        # Populating the words columns
        for word in row['words']:
            if word in bag_of_words:
                df2_dict[word] = 1
        
        df2_row = pd.DataFrame.from_dict(df2_dict)
        df2 = df2.append(df2_row)
    return df2

In [None]:
output = mapping_parallelize(df[:5])
output

In [None]:
import workers

In [None]:
import workers

#import Pool
from multiprocessing import Pool

#Assuming you want to use 3 processors
num_processors = 3

#Create a pool of processors
p=Pool(processes = num_processors)

#get them to work in parallel
# x = [1,2,3]
# output = p.map(workers.test,x)
# output

# Applying for out function:
if __name__ ==  '__main__': 
    num_processors = 3
    p=Pool(processes = num_processors)
    output = p.map(workers.mapping_parallelize,df[:5])


In [None]:
import workers as w
output = w.mapping_parallelize(df[:5])

In [None]:
import test
x = [1,2,3]
test.test(x)

In [None]:
output[output['live'] == 1]

In [None]:
sample = pd.DataFrame(columns=['words'])
sample

In [None]:
sample.append({'words':['hanger','hsbc','miss'])

In [None]:
a = ['a','b','c']
d = dict.fromkeys(a,[0])
d

In [None]:
d = pd.DataFrame.from_dict(d)
d

In [None]:
A = ['a','b','c']
D = dict.fromkeys(a,[0])
D = pd.DataFrame.from_dict(d)
D

In [None]:
D.append(d)

In [None]:
X = pd.DataFrame(columns=['a','b','c'])
X

In [None]:
X.append(d)