In [1]:
# Import the libraries 
import nltk 
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import re
from nltk.stem import WordNetLemmatizer
import numpy as np 
import gensim
from gensim.models import Word2Vec
import pandas as pd 

In [3]:
# Load the speeches dataset.
# The Windows path is written as a raw string so its backslashes are taken
# literally instead of being parsed as (invalid) escape sequences such as
# "\T" or "\P", which newer Python versions warn about.
df_data = pd.read_csv(r"E:\Tarang\Ashoka\Python\PYTHON PROJECT\PM_Modi_Speech_Text_english.csv")

In [4]:
# Display the freshly loaded DataFrame to inspect its columns and rows.
df_data

Unnamed: 0.1,Unnamed: 0,Date,Speech,Title
0,2,09 JAN 2017,Chief Minister of Gujarat Shri Vijay Rupani ji...,Text of PM's address at the Inauguration of No...
1,3,09 JAN 2017,I am delighted to be here at Gift City to ina...,Text of PM’s address on the occasion of Inaugu...
2,5,03 JAN 2017,"Governor of Andhra Pradesh, Shri E. S. L. Nara...",PM's Address at the Inauguration of the 104th ...
3,6,26 FEB 2017,"My dear countrymen, Namaskar. Winter is on its...",English Translation of the text of PM’s ‘Mann ...
4,8,21 FEB 2017,Namaskaram. Greetings to everyone. Swami Nirvi...,Text of PM’s inaugural address (via video conf...
...,...,...,...,...
1177,1263,08 APR 2023,Bharat Mata Ki Jai!\n\nBharat Mata Ki Jai!\n\n...,English rendering of PM’s address at launch of...
1178,1264,04 APR 2023,ExcellenciesHonourable Minister Mr. Harbers;Sp...,"Text of Address by Dr. P.K. Mishra, Principal ..."
1179,1265,04 APR 2023,"Namaskar!Excellencies, heads of state, Academi...",Text of PM’s remarks at International Conferen...
1180,1266,03 APR 2023,My colleague in the Union Cabinet Dr. Jitendra...,English rendering of PM’S address at the diamo...


In [5]:
# Discard the "Unnamed: 0" column; every other column is kept as is.
df_data = df_data.drop(columns=["Unnamed: 0"])

To sort the speeches chronologically, the dates must first be converted into a sortable format. The code below defines a function that converts the date strings in the Date column of a pandas DataFrame to datetime objects.

In [6]:
from datetime import datetime
# Lookup from upper-case three-letter month abbreviation to its zero-padded
# two-digit month number ("JAN" -> "01", ..., "DEC" -> "12").
# NOTE(review): no code visible in this notebook reads this mapping.
month_dict = {
    abbreviation: str(number).zfill(2)
    for number, abbreviation in enumerate(
        [
            "JAN", "FEB", "MAR", "APR", "MAY", "JUN",
            "JUL", "AUG", "SEP", "OCT", "NOV", "DEC",
        ],
        start=1,
    )
}

def dates(x):
    """Parse a "day month-abbreviation year" string into a datetime.

    Parameters
    ----------
    x : str
        Date text matching the format "%d %b %Y", e.g. "09 JAN 2017".

    Returns
    -------
    datetime.datetime
        The parsed date, with the time part left at midnight.
    """
    return datetime.strptime(x, "%d %b %Y")
# Parse every entry of the Date column with `dates`, store the parsed values
# back in the column, then order the rows by date (ascending).
parsed_dates = df_data["Date"].apply(dates)
df_data["Date"] = parsed_dates
df_data = df_data.sort_values(by="Date")


In [7]:
# Display the DataFrame after date parsing and sorting.
df_data

Unnamed: 0,Date,Speech,Title
2,2017-01-03,"Governor of Andhra Pradesh, Shri E. S. L. Nara...",PM's Address at the Inauguration of the 104th ...
0,2017-01-09,Chief Minister of Gujarat Shri Vijay Rupani ji...,Text of PM's address at the Inauguration of No...
1,2017-01-09,I am delighted to be here at Gift City to ina...,Text of PM’s address on the occasion of Inaugu...
6,2017-02-01,I congratulate the Finance Minister Arun Jaitl...,English rendering of the Prime Minister’s Stat...
5,2017-02-05,Sri Pejavar Math’s most respected Sri Vishwesh...,English rendering of the text of PM’s address ...
...,...,...,...
1175,2023-04-09,My colleagues in the Union Cabinet Shri Bhupen...,English rendering of PM’s address at the inaug...
1173,2023-04-12,Namaskar! \n\nThe campaign to provide governme...,English rendering of PM’s address at training ...
1174,2023-04-12,"Namaskar, Governor of Rajasthan Shri Kalraj Mi...",English rendering of PM’s address during flagg...
1172,2023-04-14,"Governor of Assam Shri Gulab Chand Kataria ji,...",English rendering of PM’s address at inaugurat...


### Indexing

In [8]:
# Build one sequential label per row: "NMS1", "NMS2", ..., "NMS<row count>".
new_index = ["NMS{}".format(position) for position in range(1, len(df_data) + 1)]
# Pair each current row label with its new label (in row order) and relabel
# the index through that mapping.
label_mapping = dict(zip(df_data.index, new_index))
df_data = df_data.rename(index=label_mapping)
df_data

Unnamed: 0,Date,Speech,Title
NMS1,2017-01-03,"Governor of Andhra Pradesh, Shri E. S. L. Nara...",PM's Address at the Inauguration of the 104th ...
NMS2,2017-01-09,Chief Minister of Gujarat Shri Vijay Rupani ji...,Text of PM's address at the Inauguration of No...
NMS3,2017-01-09,I am delighted to be here at Gift City to ina...,Text of PM’s address on the occasion of Inaugu...
NMS4,2017-02-01,I congratulate the Finance Minister Arun Jaitl...,English rendering of the Prime Minister’s Stat...
NMS5,2017-02-05,Sri Pejavar Math’s most respected Sri Vishwesh...,English rendering of the text of PM’s address ...
...,...,...,...
NMS1178,2023-04-09,My colleagues in the Union Cabinet Shri Bhupen...,English rendering of PM’s address at the inaug...
NMS1179,2023-04-12,Namaskar! \n\nThe campaign to provide governme...,English rendering of PM’s address at training ...
NMS1180,2023-04-12,"Namaskar, Governor of Rajasthan Shri Kalraj Mi...",English rendering of PM’s address during flagg...
NMS1181,2023-04-14,"Governor of Assam Shri Gulab Chand Kataria ji,...",English rendering of PM’s address at inaugurat...


In [9]:
# Cast the Speech column to str so that every value handed to the sentence
# tokenizer in the cleaning step is text.
df_data['Speech'] = df_data['Speech'].astype(str)

In [10]:
# Create a WordNet lemmatizer instance (a lemmatizer, not a trained model).
# NOTE(review): `cleaned` below creates its own local lemmatizer, so this
# module-level instance is not used by any code visible in this notebook.
lemmatizer = WordNetLemmatizer()

#### Cleaning the Speeches
We define a function to clean the speeches. Cleaning involves sentence tokenisation, removing special characters, converting the text to lower case, removing stopwords and, finally, lemmatization.

In [11]:
def cleaned(speech):
    """Split a speech into sentences and normalise each sentence.

    Every sentence has its non-alphanumeric characters replaced by spaces, is
    lower-cased and split on whitespace; English stop words are removed and the
    remaining words are lemmatized with WordNet and re-joined with single
    spaces.

    Parameters
    ----------
    speech : str
        Raw text of one speech.

    Returns
    -------
    list of str
        One cleaned string per sentence, in original order, excluding the
        final sentence (see the note at the return statement).
    """
    # Build the stop-word set and the lemmatizer once per call. Previously the
    # stop-word list was re-read and converted to a set for every single word,
    # and a new lemmatizer was created for every sentence.
    stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()

    # Cleaned sentences, collected in order.
    corpus = []

    for sentence in nltk.sent_tokenize(speech):
        # Replace everything except ASCII letters and digits with a space
        # (digits are kept), lower-case the result and split it into words.
        words = re.sub("[^a-zA-Z0-9]", " ", sentence).lower().split()

        # Drop stop words, then lemmatize what remains.
        kept_words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]

        # Re-assemble the sentence and store it.
        corpus.append(' '.join(kept_words))

    # The last sentence is deliberately dropped because it is considered
    # unnecessary; a speech with a single sentence therefore yields [].
    return corpus[:-1]


# Collapse a cleaned speech (a sequence of sentence strings) into one string.
def string(speech):
    """Return the items of ``speech`` joined by single spaces."""
    separator = " "
    return separator.join(speech)


In [None]:
# Run `cleaned` over every speech; each result is a list of cleaned sentences,
# stored in the new 'Cleaned_Speech' column.
cleaned_sentences = df_data["Speech"].apply(cleaned)
df_data["Cleaned_Speech"] = cleaned_sentences

# Join each list of cleaned sentences into a single string with `string` and
# store it in the new 'Cleaned_Speech_string' column.
df_data["Cleaned_Speech_string"] = cleaned_sentences.apply(string)

In [38]:
# Move the current row labels into a regular column (shown as "index" in the
# output below) and replace them with a default integer index.
df_data = df_data.reset_index(drop=False)

In [39]:
# Display the final DataFrame, including the two cleaned-speech columns.
df_data

Unnamed: 0,index,Date,Speech,Title,Cleaned_Speech,Cleaned_Speech_string
0,NMS1,2017-01-03,"Governor of Andhra Pradesh, Shri E. S. L. Nara...",PM's Address at the Inauguration of the 104th ...,[governor andhra pradesh shri e l narasimhan c...,governor andhra pradesh shri e l narasimhan ch...
1,NMS2,2017-01-09,Chief Minister of Gujarat Shri Vijay Rupani ji...,Text of PM's address at the Inauguration of No...,[chief minister gujarat shri vijay rupani ji c...,chief minister gujarat shri vijay rupani ji co...
2,NMS3,2017-01-09,I am delighted to be here at Gift City to ina...,Text of PM’s address on the occasion of Inaugu...,[delighted gift city inaugurate india first in...,delighted gift city inaugurate india first int...
3,NMS4,2017-02-01,I congratulate the Finance Minister Arun Jaitl...,English rendering of the Prime Minister’s Stat...,[congratulate finance minister arun jaitley je...,congratulate finance minister arun jaitley jee...
4,NMS5,2017-02-05,Sri Pejavar Math’s most respected Sri Vishwesh...,English rendering of the text of PM’s address ...,[sri pejavar math respected sri vishwesh tirth...,sri pejavar math respected sri vishwesh tirth ...
...,...,...,...,...,...,...
1177,NMS1178,2023-04-09,My colleagues in the Union Cabinet Shri Bhupen...,English rendering of PM’s address at the inaug...,[colleague union cabinet shri bhupender yadav ...,colleague union cabinet shri bhupender yadav j...
1178,NMS1179,2023-04-12,Namaskar! \n\nThe campaign to provide governme...,English rendering of PM’s address at training ...,"[namaskar, campaign provide government job you...",namaskar campaign provide government job youth...
1179,NMS1180,2023-04-12,"Namaskar, Governor of Rajasthan Shri Kalraj Mi...",English rendering of PM’s address during flagg...,[namaskar governor rajasthan shri kalraj mishr...,namaskar governor rajasthan shri kalraj mishra...
1180,NMS1181,2023-04-14,"Governor of Assam Shri Gulab Chand Kataria ji,...",English rendering of PM’s address at inaugurat...,[governor assam shri gulab chand kataria ji ch...,governor assam shri gulab chand kataria ji chi...


In [40]:
# Write the cleaned DataFrame to a CSV file; index=False leaves the row index
# out of the file.
cleaned_csv_path = "C:/Users/HP/Downloads/PM_Modi_Speech_Cleaned_string.csv"
df_data.to_csv(cleaned_csv_path, index=False)