Importing the Dependencies

In [120]:
import pickle
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
from nltk.stem import PorterStemmer
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold, cross_val_predict, GridSearchCV
from sklearn import svm
from sklearn.metrics import accuracy_score, f1_score 

[nltk_data] Downloading package stopwords to /Users/mmm/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Data Collection and Analysis

PIMA Diabetes Dataset

In [121]:
# loading the diabetes dataset to a pandas DataFrame
df_labeled_emails = pd.read_excel('Data_Projet_NLP.xlsx')

In [122]:
# printing the first 5 rows of the dataset
df_labeled_emails.head()

Unnamed: 0,Label,Email
0,Marketing,Subject: Introducing a New Social Media Campai...
1,Marketing,Subject: Proposal for Influencer Partnership\n...
2,Marketing,Subject: New Content Marketing Strategy\n\nDea...
3,Marketing,"Dear Chris,\n\nI wanted to share with you a co..."
4,Marketing,"Subject: New Branding Strategy\n\nDear Chris,\..."


In [123]:
# number of rows and Columns in this dataset
df_labeled_emails.shape

(207, 2)

In [124]:
df_labeled_emails['Label'].value_counts()

Label
IT                57
Marketing         53
Sales             35
Accountability    33
HR                29
Name: count, dtype: int64

0 --> Non-Diabetic

1 --> Diabetic

In [125]:
# separating the data and labels
df_emails = df_labeled_emails['Email']
df_target = df_labeled_emails['Label']

In [126]:
#Context words
contex_words = ['subject','best','regards','your name','recipient','dear', 'please','name', 'thank', 'email']

#Stopwords
stopwords = nltk.corpus.stopwords.words('english')
stopWords = set(stopwords)
filename_stopwords = 'stopwords.sav'
pickle.dump(stopWords, open(filename_stopwords, 'wb'))

#Tokenizer
tokenizer = RegexpTokenizer(r'[a-zA-Z]+')

# instanciate the Stemmer
ps = PorterStemmer()

In [127]:
# preprocessing of the emails
def preprocessing_email(email):
    # splits the sentences in words keeping alphabatic values only
    words = tokenizer.tokenize(email)
    
    wordsFiltered = []
    for w in words:
        # lowercase words
        w = w.lower()
        # retrieve stopwords and context words
        if w not in stopWords and w not in contex_words:
            # Stemming of the filtred words
            wordsFiltered.append(ps.stem(w))

    return " ".join(wordsFiltered)

# preprocessing of the corpus of emails
preprocessed_email_corpus = []

for email in df_emails:
    preprocessed_email = preprocessing_email(email)
    preprocessed_email_corpus.append(preprocessed_email)

print('Preprocessed email corpus:\n', preprocessed_email_corpus)

Preprocessed email corpus:
 ['introduc new social media campaign want share new social media campaign market team work campaign design increas engag awar social media channel drive traffic websit campaign includ seri short video showcas product servic action well target advertis strategi reach key audienc segment also leverag user gener content increas authent reach campaign believ campaign potenti gener signific result compani excit see perform let know question feedback harri market team', 'propos influenc partnership want share propos influenc partnership believ would great fit brand influenc question larg engag follow target market believ endors could help us reach wider audienc build brand awar propos partnership would involv seri sponsor post stori influenc social media channel well dedic land page websit exclus offer follow believ partnership would win win compani influenc excit potenti result let know question anyth els would like us consid harri market team', 'new content mark

In [128]:
# TF-IDF to create the embedding/ feaure space
# initialize the vectorizer
vectorizer = TfidfVectorizer()
# fit and transform the corpus
tfidf_matrix = vectorizer.fit_transform(preprocessed_email_corpus)

# print the dataframe
feature_names = vectorizer.get_feature_names_out()
print('\nNumber of features:', len(feature_names))
print("\nTF-IDF MATRIX:\n\n", pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names))


Number of features: 1169

TF-IDF MATRIX:

      abl  absolut  accept  access  accommod  accomplish  accordingli  account   
0    0.0      0.0     0.0     0.0       0.0         0.0     0.000000      0.0  \
1    0.0      0.0     0.0     0.0       0.0         0.0     0.000000      0.0   
2    0.0      0.0     0.0     0.0       0.0         0.0     0.000000      0.0   
3    0.0      0.0     0.0     0.0       0.0         0.0     0.000000      0.0   
4    0.0      0.0     0.0     0.0       0.0         0.0     0.000000      0.0   
..   ...      ...     ...     ...       ...         ...          ...      ...   
202  0.0      0.0     0.0     0.0       0.0         0.0     0.126746      0.0   
203  0.0      0.0     0.0     0.0       0.0         0.0     0.000000      0.0   
204  0.0      0.0     0.0     0.0       0.0         0.0     0.000000      0.0   
205  0.0      0.0     0.0     0.0       0.0         0.0     0.000000      0.0   
206  0.0      0.0     0.0     0.0       0.0         0.0     0.000

Train Test Split

In [129]:

# Generate stratified Train/Test folds (The folds are made by preserving the percentage of samples for each class)

def train_test_stratified_split(X, y):
  sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2)
  print(sss)
  for _, (train_index, test_index) in enumerate(sss.split(X, y)):
      X_train = X[train_index]
      X_test = X[test_index]
      y_train = y[train_index]
      y_test = y[test_index]
  
  return X_train, y_train, X_test, y_test

X = tfidf_matrix.toarray()
y = df_target

X_train, y_train, X_test, y_test = train_test_stratified_split(X, y)

StratifiedShuffleSplit(n_splits=1, random_state=None, test_size=0.2,
            train_size=None)


In [130]:
print(X.shape, X_train.shape, X_test.shape)

(207, 1169) (165, 1169) (42, 1169)


Training the Model

In [131]:
svm_clf = svm.SVC()
   # we use GridSearch to find the best parameters for the classifier
C_array = np.linspace(0.1, 10.0, num=10)

parametres = {
                  'C': C_array,
                  'kernel': ['rbf', 'linear','poly'],
                  'gamma':[0.001, 0.1, 0.5]
                 }
grid_svm = GridSearchCV(svm_clf, param_grid=parametres)

In [132]:
#training the support vector Machine Classifier
grid_svm.fit(X, y)
print('Best params:', grid_svm.best_params_)
# use the best classifier found by GridSearchCV
best_svm_clf = grid_svm.best_estimator_

Best params: {'C': 1.2000000000000002, 'gamma': 0.001, 'kernel': 'linear'}


Model Evaluation

Accuracy Score

In [133]:
# use cross-validation to evaluate the best classifier
cv_svm = StratifiedKFold(n_splits=5, shuffle=True)
y_pred_svm = cross_val_predict(best_svm_clf, X, y, cv=cv_svm)
accuracy = accuracy_score(y, y_pred_svm)
f1_score = f1_score(y, y_pred_svm, average='weighted')

In [134]:
print('Accuracy score of the test data : ', accuracy)
print('F1 score of the test data : ', f1_score)

Accuracy score of the test data :  0.9130434782608695
F1 score of the test data :  0.9131760815235819


Saving the trained model

In [135]:
filename_clf = 'email_classification_model.sav'
pickle.dump(best_svm_clf, open(filename_clf, 'wb'))

filename_vec = 'email_vectorizer.sav'
pickle.dump(vectorizer, open(filename_vec, 'wb'))

In [136]:
# loading the saved model
loaded_model = pickle.load(open('email_classification_model.sav', 'rb'))

In [137]:
raw_input_data = "Subject: Invitation to Attend a Job Interview\
Dear [Applicant Name],\
Thank you for applying for the [Job Title] position with our company. We were impressed with your qualifications and experience, and we would like to invite you to attend an interview with our team.\
The interview will take place on [Date] at [Time] at our office located at [Address]. During the interview, we will discuss your qualifications, experience, and your fit with our company culture. You will also have the opportunity to ask any questions you may have about the position and the company.\
Please let us know if this date and time work for you. If not, we are happy to arrange a different time that is convenient for you. Also, please bring a copy of your updated resume and any relevant certifications or documents.\
We would like to remind you that the interview is an important part of our selection process, and we ask that you dress appropriately and arrive on time.\
We look forward to meeting with you and getting to know you better.\
Best regards,"

preprocessed_input_data = []
preprocessed_input_data.append(preprocessing_email(raw_input_data))

preprocessed_input_data_embedding = vectorizer.transform(preprocessed_input_data)

input_data_as_numpy_array = preprocessed_input_data_embedding.toarray()

prediction = loaded_model.predict(input_data_as_numpy_array)
print(prediction[0])

HR
