In [1]:
# Importing important libraries and modules

import re
import numpy as np
import requests
from bs4 import BeautifulSoup
import pandas as pd
from sklearn.cluster import KMeans
from nltk.tokenize import word_tokenize
import nltk
from sklearn import metrics
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from sklearn.naive_bayes import MultinomialNB,BernoulliNB
from sklearn.naive_bayes import GaussianNB
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC,LinearSVC
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# urljoin lives in a different module on Python 2 and Python 3
try:
    from urllib.parse import urljoin
except ImportError:
    from urlparse import urljoin

# Root of the site; the first page URL is resolved against it
base_url = 'https://www.mygov.in/'

# URL of the page to download next; becomes None when there is no further page
next_page = urljoin(base_url, '/group-issue/share-your-ideas-pm-narendra-modis-mann-ki-baat-26th-march-2017/')

# lists that will store the requisite data
suggestions = []
username = []
date = []

# Stop requesting further pages once this many suggestions have been collected
max_suggestions = 60

while next_page and len(suggestions) < max_suggestions:

    # Downloading the webpage. The timeout keeps a stalled connection from
    # hanging the cell, and an HTTP error raises instead of being parsed as
    # if it were a page of comments.
    page = requests.get(next_page, timeout=30)
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')

    # Text of every comment body, date stamp and user name on this page
    suggestions.extend(elem.get_text() for elem in soup.select('div.comment_body'))
    date.extend(elem.get_text() for elem in soup.select('span.date_time'))
    username.extend(elem.get_text() for elem in soup.select('span.username'))

    # The 'View More' button is the pager item; its <a href> points at the next page
    pager_item = soup.find('li', class_='pager-next first last')
    pager_link = pager_item.find('a') if pager_item is not None else None
    href = pager_link.get('href') if pager_link is not None else None

    # Resolve the link against the page just fetched, so relative and
    # absolute hrefs both work and no '//' is introduced into the path
    next_page = urljoin(next_page, href) if href else None

    # Extra - for testing
    print('comments: {}'.format(len(suggestions)))


comments: 10
comments: 20
comments: 30
comments: 40
comments: 50
comments: 60


In [3]:
# Creating a Dataframe ( table ) to store the above extracted list values

column_headers = ["Username","Suggestion","Made On","Category"]

# Series versions of the scraped lists
u = pd.Series(username)
d = pd.Series(date)
s = pd.Series(suggestions)

# Build the frame in one step; "Category" is not supplied here, so it starts
# out empty and is filled in below
df = pd.DataFrame({"Username": u, "Suggestion": s, "Made On": d},
                  columns=column_headers)

# Manually adding the column for Categories
# NOTE(review): this list holds 61 labels while the recorded run scraped 60
# suggestions, so the last label is not used - confirm the labels line up
# with the rows they were written for.
# NOTE(review): both "Job" and "Jobs" appear; as distinct strings they are
# distinct classes - confirm whether they should be merged.
category = ["Medical","Traffic","Education","Aadhar","Maritime","Others","Others","Others","Others","Others","Maritime","Safety","Energy","Banking","Banking","Bribery","Safety","Safety","Different","Safety","Others","Others","Cleanliness","Others","Education","Others","Others","Education","Others","Education","Others","Safety","Job","Law","Achievement","Banking","Law","Technology","Safety","Development",
           "Others","Others","Safety","Education","Black Money","Education","Nature","Others","Pension","Education","Jobs","Energy","Law","Pension","Pension","Health","Traffic","Others","Others","Law","Job"]

# Assigning a Series aligns on the row index: surplus labels are dropped and
# missing ones become NaN, so report a length mismatch instead of hiding it
if len(category) != len(df):
    print('warning: {} category labels for {} suggestions'.format(len(category), len(df)))
df["Category"] = pd.Series(category)
df

Unnamed: 0,Username,Suggestion,Made On,Category
0,Hari Mohan Gupta,Sir my humble submission is that please ask pu...,1 month 3 days ago,Medical
1,paras shah,Hello\n Sir.... Mera AK idea hai Jese bus tic...,1 month 3 days ago,Traffic
2,Anu Verma,"Respect sir,\nI am Hindi teacher in one of the...",1 month 3 days ago,Education
3,Jayesh Kulkarni,Sir\nsuggestions AADHAR BASE SYSTEM\n1.Cash Le...,1 month 3 days ago,Aadhar
4,AKASH GAUTAM,SIR KINDLY LOOK INTO MARITIME SECTOR SPECIALLY...,1 month 3 days ago,Maritime
5,Mamata Singh,महोदय आपके सुशासन में सम्बन्धित मंत्रालय के अध...,1 month 3 days ago,Others
6,Mamata Singh,महोदय यहां सोचने का विषय यह है कि 9 माह के अन्...,1 month 3 days ago,Others
7,Mamata Singh,महोदय 9 माह का समय बीत गया न जाने इस अवधि में ...,1 month 3 days ago,Others
8,Mamata Singh,महोदय 9 माह का समय बीत जाने के बाद भी आज तक C...,1 month 3 days ago,Others
9,Mamata Singh,"आदरणी प्रधानमंत्री जी,\nभारत सरकार\nमहोदय, साद...",1 month 3 days ago,Others


In [4]:
# Cleaning the comments

# Filled on first use so the NLTK stopword corpus is read once, not once per comment
_ENGLISH_STOPS = []

def clean_suggestion( comment, stops=None ):
    """Reduce a raw comment to lowercase English letters-only words.

    Every character outside a-z / A-Z is replaced by a space, the text is
    lowercased and split on whitespace, and words found in ``stops`` are
    dropped.

    Parameters
    ----------
    comment : str
        Raw comment text.
    stops : container of str, optional
        Words to remove. When omitted, NLTK's English stopword list is used
        (loaded once and cached for later calls).

    Returns
    -------
    str
        The remaining words joined by single spaces; empty if none remain.
    """
    if stops is None:
        if not _ENGLISH_STOPS:
            _ENGLISH_STOPS.append(frozenset(stopwords.words("english")))
        stops = _ENGLISH_STOPS[0]

    # Only allow letters
    letters_only = re.sub("[^a-zA-Z]", " ", comment)

    # Only allow lowercase letters
    word_list = letters_only.lower().split()

    # A final list of all words we will be needing
    meaningful_words = [w for w in word_list if w not in stops]

    # Joining the tokenized words back into one string
    return " ".join(meaningful_words)

In [6]:
# Preparing the TRAINING DATA

# Number of suggestions used for training: every row except the last 10,
# which are held out for testing. For this case it is '60' - '10'
count = df["Suggestion"].size - 10

# Cleaned text of each training suggestion, in row order
clean_train_data = [clean_suggestion(text) for text in df["Suggestion"].iloc[:count]]

# Bag-of-words feature set: one column per vocabulary word (at most 1000
# words are kept), each entry being how often that word occurs in the row
vectorizer = CountVectorizer(analyzer="word",
                             tokenizer=None,
                             preprocessor=None,
                             stop_words=None,
                             max_features=1000)

# Store the suggestion feature set as an array
# Arrays are easier to deal with
train_data_features = vectorizer.fit_transform(clean_train_data).toarray()

# Store the categories of the same rows as an array, so that features and
# labels always have the same length
train_category_array = np.array(df["Category"].iloc[:count])

distinct_train_category = set(train_category_array)
print(train_data_features.shape)
# In the recorded run the shape is (50, 942): 50 rows of training data, each
# described by 942 vocabulary words extracted from the suggestions. Each of
# the 942 entries is the occurrence count of that word in the suggestion
# (0 when the word is absent).

(50, 942)


In [8]:
# Experiment - To view the vocabulary
# Newer scikit-learn releases replaced get_feature_names() with
# get_feature_names_out(); use whichever this installation provides
if hasattr(vectorizer, "get_feature_names_out"):
    vocab = list(vectorizer.get_feature_names_out())
else:
    vocab = vectorizer.get_feature_names()
#print(vocab)

In [9]:
# Preparing the TESTING DATA

# The last 10 rows are held out for testing; every row before them is
# training data. For this case the first test row is '60' - '10' = 50
test_start = df["Suggestion"].size - 10

test_data = df["Category"].iloc[test_start:]

# One past the last test row: count = 50 + 10 in this case
count = test_data.size + test_start

# Clean each held-out suggestion the same way as the training suggestions
clean_test_data = [clean_suggestion(text) for text in df["Suggestion"].iloc[test_start:count]]

# Reuse the vocabulary learned from the training data (transform, not fit)
test_data_features = vectorizer.transform(clean_test_data).toarray()

test_category_array = np.array(df["Category"].iloc[test_start:])

In [10]:
# CLASSIFIERS

# Scores are expected to get better as more labelled suggestions are added
# Gaussian Naive Bayes: the true labels of the held-out rows are kept in
# expected_category, the model's guesses for them in predicted_category
expected_category = test_category_array
NB_classifier = GaussianNB().fit(train_data_features, train_category_array)
predicted_category = NB_classifier.predict(test_data_features)

# Per-class precision / recall / f1, followed by the confusion matrix
print(metrics.classification_report(y_true=expected_category, y_pred=predicted_category))
print(metrics.confusion_matrix(y_true=expected_category, y_pred=predicted_category))

             precision    recall  f1-score   support

    Banking       0.00      0.00      0.00         0
  Education       0.00      0.00      0.00         0
     Energy       0.00      0.00      0.00         1
     Health       0.00      0.00      0.00         1
       Jobs       0.00      0.00      0.00         1
        Law       0.00      0.00      0.00         2
     Others       0.00      0.00      0.00         2
    Pension       0.00      0.00      0.00         2
     Safety       0.00      0.00      0.00         0
    Traffic       0.00      0.00      0.00         1

avg / total       0.00      0.00      0.00        10

[[0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 1 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 0]
 [0 1 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 2 0]
 [1 1 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 2 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 1 0 0 0]]


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [11]:
# Applying Random Forest Classifier

# 100 trees; random_state is pinned (same seed as the KMeans cell) so that
# re-running the notebook builds the same forest and reports the same scores
RF_classifier = RandomForestClassifier(n_estimators = 100, random_state = 0)
RF_classifier.fit( train_data_features, train_category_array )

# Predicted category for each held-out test row
predicted_category = RF_classifier.predict(test_data_features)

# Per-class precision / recall / f1, followed by the confusion matrix
print(metrics.classification_report(expected_category, predicted_category))
print(metrics.confusion_matrix(expected_category, predicted_category))

             precision    recall  f1-score   support

     Energy       0.00      0.00      0.00         1
     Health       0.00      0.00      0.00         1
       Jobs       0.00      0.00      0.00         1
        Law       0.00      0.00      0.00         2
     Others       0.20      1.00      0.33         2
    Pension       0.00      0.00      0.00         2
    Traffic       0.00      0.00      0.00         1

avg / total       0.04      0.20      0.07        10

[[0 0 0 0 1 0 0]
 [0 0 0 0 1 0 0]
 [0 0 0 0 1 0 0]
 [0 0 0 0 2 0 0]
 [0 0 0 0 2 0 0]
 [0 0 0 0 2 0 0]
 [0 0 0 0 1 0 0]]


In [12]:
# Applying KNN

# k-nearest-neighbours with the library's default settings, fitted on the
# training features and then asked to label the held-out rows
KNN_classifier = KNeighborsClassifier().fit(train_data_features, train_category_array)
predicted_category = KNN_classifier.predict(test_data_features)

# Per-class precision / recall / f1, followed by the confusion matrix
print(metrics.classification_report(y_true=expected_category, y_pred=predicted_category))
print(metrics.confusion_matrix(y_true=expected_category, y_pred=predicted_category))

             precision    recall  f1-score   support

     Energy       0.00      0.00      0.00         1
     Health       0.00      0.00      0.00         1
       Jobs       0.00      0.00      0.00         1
        Law       0.00      0.00      0.00         2
     Others       0.20      1.00      0.33         2
    Pension       0.00      0.00      0.00         2
    Traffic       0.00      0.00      0.00         1

avg / total       0.04      0.20      0.07        10

[[0 0 0 0 1 0 0]
 [0 0 0 0 1 0 0]
 [0 0 0 0 1 0 0]
 [0 0 0 0 2 0 0]
 [0 0 0 0 2 0 0]
 [0 0 0 0 2 0 0]
 [0 0 0 0 1 0 0]]


In [13]:
# CLUSTERING

# Group the training suggestions into 10 clusters; the seed is fixed so the
# clustering is repeatable
kmeans = KMeans(n_clusters=10, random_state=0).fit(train_data_features)

# Cluster index assigned to each training row
print(kmeans.labels_)

# Cell output: index of the closest cluster for each held-out test row
kmeans.predict(test_data_features)

[0 1 0 2 0 0 0 0 0 0 0 0 0 5 0 0 9 0 0 9 0 0 0 0 0 0 4 0 0 6 0 0 0 0 0 0 0
 0 0 8 0 7 0 0 0 0 0 0 3 0]




array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)