## Machine Learning internship
## Task:
### Example:
  1. User applies for a job and receives a confirmation email.
  2. The subject of the email contains the keywords "Thank you for applying".
  3. User applies for n jobs and receives n emails, each with a subject containing the keywords "Thank you for applying".
  4. Filter out all the emails received after applying for a job.



In [1]:
#importing the required libraries:
import numpy as np
import pandas as pd

In [2]:
# No installation is needed: the `mailbox` module used below (mailbox.mbox)
# ships with the Python standard library and reads on-disk mailbox formats
# such as mbox. The pip package of the same name (described as an IMAP-mailbox
# reader) is a different project and is not used by this notebook.



In [3]:
# Load the dataset.
# Details on converting Gmail content to mbox can be found at
# https://spinbackup.com/blog/how-to-import-mbox-to-gmail/
# A full export is named data.mbox; only the inbox content was downloaded here,
# hence the filename Inbox.mbox.

import mailbox
mboxfile = "Inbox.mbox"

# create=False: raise mailbox.NoSuchMailboxError right away when the file is
# missing, instead of silently creating a new, empty mailbox at that path.
inbox_mbox = mailbox.mbox(mboxfile, create=False)

In [4]:
# List the header names available on the first message of inbox_mbox, one per line.
first_message = inbox_mbox[0]
for header_name in first_message.keys():
    print(header_name)

X-GM-THRID
X-Gmail-Labels
Delivered-To
Received
X-Received
ARC-Seal
ARC-Message-Signature
ARC-Authentication-Results
Return-Path
Received
Received-SPF
Authentication-Results
DKIM-Signature
X-Google-DKIM-Signature
X-Gm-Message-State
X-Google-Smtp-Source
X-Received
MIME-Version
From
Date
Message-ID
Subject
To
Content-Type


In [5]:
import csv
# Create a CSV file holding only the attributes needed for the analysis:
# the Subject, From and Date headers of every message in the mailbox.
#
# newline='' is what the csv module requires for files it writes: without it,
# row terminators are translated a second time (blank rows on Windows) and
# newlines embedded in a quoted field are not preserved correctly.
# The explicit UTF-8 encoding makes the file independent of the platform's
# default encoding.
with open('inbox.csv', 'w', newline='', encoding='utf-8') as outputfile:
    writer = csv.writer(outputfile)
    writer.writerow(['subject', 'from', 'date'])
    writer.writerows(
        [message['Subject'], message['From'], message['Date']]
        for message in inbox_mbox
    )

In [6]:
# Load the exported CSV file into a DataFrame; this is the dataset used from here on.
emaildata = pd.read_csv(filepath_or_buffer='inbox.csv')

In [None]:
# Preview the first five rows of the dataset.
emaildata.head(n=5)

In [8]:
# Summarize the dataset: index range, column names with their dtypes and
# non-null counts, and memory usage.
emaildata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 928 entries, 0 to 927
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   subject  928 non-null    object
 1   from     928 non-null    object
 2   date     928 non-null    object
dtypes: object(3)
memory usage: 21.9+ KB


In [9]:
# Parse every raw date string into a UTC timestamp, one value at a time;
# values that cannot be parsed are coerced instead of raising.
def _to_utc_timestamp(raw_date):
    return pd.to_datetime(raw_date, errors='coerce', utc=True)

emaildata['date'] = emaildata['date'].apply(_to_utc_timestamp)

In [10]:
# Keep only the calendar date of each timestamp (the time-of-day part is dropped).
# The result holds Python date objects, so the column dtype is object again.
emaildata['date'] = emaildata['date'].dt.date

In [11]:
# Summarize the dataset again after extracting the date: index range, column
# names with dtypes and non-null counts, and memory usage.
# The 'date' column is still reported as object dtype at this point.
emaildata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 928 entries, 0 to 927
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   subject  928 non-null    object
 1   from     928 non-null    object
 2   date     928 non-null    object
dtypes: object(3)
memory usage: 21.9+ KB


In [12]:
# Convert the date column from object dtype (left by the date extraction) to
# datetime64[ns] so it is stored as a proper datetime column.
emaildata['date'] = emaildata['date'].astype('datetime64[ns]')

In [13]:
# Summarize the dataset once more to confirm the conversion: the 'date' column
# is now reported as datetime64[ns].
emaildata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 928 entries, 0 to 927
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   subject  928 non-null    object        
 1   from     928 non-null    object        
 2   date     928 non-null    datetime64[ns]
dtypes: datetime64[ns](1), object(2)
memory usage: 21.9+ KB


In [None]:
# Preview the first five rows of the dataset, now with the converted date column.
emaildata.head(n=5)

In [15]:

# Filter the confirmation emails (subject contains 'Thank you for applying')
# and count how many of them mention each known job category.
application_keyword = 'Thank you for applying'
job_categories = (
    'AI Intern',
    'Data Scientist Intern',
    'Machine Learning / Deep Learning Intern',
)

# Full list of [subject, from, date] for every email whose subject contains the keyword.
job_category_lst = []
# Job category -> number of matching emails whose subject mentions that category.
# A category only appears as a key once it has been seen at least once.
job_category_dict = {}

for subject, sender, received in zip(
    emaildata['subject'], emaildata['from'], emaildata['date']
):
    # A missing subject is NaN (a float) in a pandas object column; skip it
    # rather than letting the substring test raise TypeError.
    if not isinstance(subject, str) or application_keyword not in subject:
        continue
    job_category_lst.append([subject, sender, received])
    # A subject may mention more than one category; each mention is counted.
    for category in job_categories:
        if category in subject:
            job_category_dict[category] = job_category_dict.get(category, 0) + 1

In [None]:
# Display the full list of [subject, from, date] entries for the emails whose
# subject contains the 'Thank you for applying' keywords.
job_category_lst

In [17]:
# Display the per-category counts for the emails whose subject contains the
# 'Thank you for applying' keywords.
job_category_dict

{'AI Intern': 1,
 'Data Scientist Intern': 1,
 'Machine Learning / Deep Learning Intern': 1}