In [75]:
from nltk.classify.util import apply_features
from nltk import NaiveBayesClassifier
import pandas as pd
import pickle
import collections
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
import nltk
from nltk.tokenize import word_tokenize


class SpamClassifier:
    """Naive Bayes spam classifier using word-presence (bag-of-words) features.

    Attributes set by ``train``:
        corpus: list of ``(tokens, label)`` tuples, one per training text.
        word_features: set of unique tokens found in the training corpus.
        classifier: the trained NLTK ``NaiveBayesClassifier``.
    """

    @staticmethod
    def _tokenize(document):
        """Split one text into lowercase alphabetic words of at least 3 characters."""
        lowered = (word.lower() for word in document.split() if word.isalpha())
        return [word for word in lowered if len(word) >= 3]

    def extract_tokens(self, text, target):
        """Tokenize every text and pair it with its own label.

        parameters:
                text: iterable of raw texts (strings)
                target: iterable of labels, aligned with ``text``

        Returns a list of ``(tokens, label)`` tuples, one per input text, where
        ``tokens`` is the list of lowercase words of that text that are fully
        alphabetic and at least 3 characters long.

        NOTE: tokens stay grouped per document. Flattening all words into one
        list and zipping it with ``target`` would pair individual words with
        the labels of unrelated documents.
        """
        return [
            (self._tokenize(document), label)
            for document, label in zip(text, target)
        ]

    def get_features(self, corpus):
        """Return the set of unique words in the complete corpus.

        parameters:- corpus: list of ``(tokens, label)`` tuples, i.e. the
                     output of ``extract_tokens``.

        Return type is a set of strings (labels are not part of the vocabulary).
        """
        return {word for tokens, _label in corpus for word in tokens}

    def extract_features(self, document):
        """Map one document onto a word-presence feature vector.

        parameters:- document: an iterable of tokens, or a raw string (which is
                     tokenized the same way as the training texts).

        Return type : a dictionary keyed by every word in ``self.word_features``;
                      each value is True when the document contains that word,
                      otherwise False.
        """
        # A raw string must be tokenized first: set("text") would yield
        # characters, and no vocabulary word would ever match.
        if isinstance(document, str):
            document = self._tokenize(document)
        doc_words = set(document)
        return {word: (word in doc_words) for word in self.word_features}

    def train(self, text, labels):
        """Train the classifier on ``text`` / ``labels``.

        Stores the tokenized corpus in ``self.corpus``, the vocabulary in
        ``self.word_features`` and the trained model in ``self.classifier``.

        Returns the tuple ``(self.classifier, self.word_features)``.
        """
        self.corpus = self.extract_tokens(text, labels)
        self.word_features = self.get_features(self.corpus)

        # labeled=True: every corpus entry is a (tokens, label) pair, so the
        # feature extractor is applied to the tokens and the label is kept.
        train_set = nltk.classify.util.apply_features(
            self.extract_features, self.corpus, labeled=True
        )
        self.classifier = nltk.classify.NaiveBayesClassifier.train(train_set)

        return self.classifier, self.word_features

    def _classify_one(self, sentence):
        """Classify a single raw text with the trained model."""
        return self.classifier.classify(
            self.extract_features(self._tokenize(sentence))
        )

    def predict(self, text):
        """Return predicted label(s) for the given input.

        ``text`` may be:
            * a single string -> returns one label;
            * a dictionary (including ``OrderedDict``) mapping identifiers to
              texts -> returns an ``OrderedDict`` of identifier -> label;
            * any other iterable of strings (list, tuple, array, ...) ->
              returns a list of labels in input order.

        Texts are tokenized exactly like the training data (lowercased,
        alphabetic, 3+ characters) so that they can match the vocabulary.
        """
        if isinstance(text, str):
            return self._classify_one(text)
        if isinstance(text, dict):
            return collections.OrderedDict(
                (label, self._classify_one(sentence))
                for label, sentence in text.items()
            )
        return [self._classify_one(sentence) for sentence in text]


        
    
    
if __name__ == '__main__':
    # Output locations for the pickled model and its vocabulary.
    model_name = 'spam_classifier_model.pk'
    model_word_features_name = 'spam_classifier_model_word_features.pk'

    # Load the data set and make a stratified 75/25 train/test split.
    data = pd.read_csv('email.csv')
    train_X, test_X, train_Y, test_Y = train_test_split(
        data["text"].values,
        data["spam"].values,
        stratify=data["spam"].values,
        test_size=0.25,
        shuffle=True,
        random_state=50,
    )

    # Fit the classifier on the training portion only.
    classifier = SpamClassifier()
    classifier_model, model_word_features = classifier.train(train_X, train_Y)

    # Persist the model first, then the vocabulary it was trained with.
    for out_path, payload in (
        (model_name, classifier_model),
        (model_word_features_name, model_word_features),
    ):
        with open(out_path, 'wb') as model_fp:
            pickle.dump(payload, model_fp)

    print('DONE')
    

DONE


In [1]:
import pandas as pd 

In [2]:
df = pd.read_csv("email.csv")
df.head()

Unnamed: 0,text,spam
0,Subject: this free 7 - day trial will prove th...,1
1,"Subject: followup from iris mack hi , thank ...",0
2,Subject: make your rivals envy lt is really h...,1
3,Subject: re : telephone interview with the enr...,0
4,Subject: a 1 time charge add your property / s...,1


In [22]:
def extract_features(document, word_features=("a", "an", "the", "youuu")):
    """
    Map one document onto a word-presence feature vector.

    parameters:- document: an iterable of word tokens, or a raw string, which
                 is split on whitespace into words first.
                 word_features: iterable of words to test for; defaults to the
                 small demo vocabulary used in this notebook cell.

    Return type : A dictionary with one key 'contains(<word>)' per entry of
                  word_features. The values are True or False depending on
                  whether the document contains that word.
    """
    # set() of a raw string would yield single characters, so "a" would match
    # any text containing the letter a; split into words instead.
    if isinstance(document, str):
        document = document.split()
    doc_words = set(document)
    return {
        'contains(%s)' % word: (word in doc_words)
        for word in word_features
    }

In [23]:
extract_features(["a", "an", "the", "you"])

{'contains(a)': True,
 'contains(an)': True,
 'contains(the)': True,
 'contains(youuu)': False}

In [45]:
 def extract_tokens(text, target):
     """Pair each accepted word with its label.

     parameters:
             text: array of words
             target: array of target labels, aligned with ``text``

     Returns a list of (lowercased_word, label) tuples. A word is kept only
     when it has at least 3 characters and is made up entirely of letters.
     """
     pairs = []
     for word, label in zip(text, target):
         # Length is checked first, then the alphabetic test.
         if len(str(word)) < 3 or not word.isalpha():
             continue
         pairs.append((word.lower(), label))
     return pairs

In [49]:
extract_tokens(["abfkkjs", "an", "The", "132131651", "12334566"], [1, 0, 1, 0,1])

[('abfkkjs', 1), ('the', 1)]

NameError: name 'document_words' is not defined