In [None]:
import pandas as pd 
import numpy as np
import scipy as sp
import seaborn as sns
sns.set_style('darkgrid')
import regex as re
import matplotlib.pyplot as plt

import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from sklearn.feature_extraction import stop_words
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split,GridSearchCV
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vinod\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Path of the CSRIC best-practices CSV, relative to the notebook's working directory.
loc = 'CSRIC_Best_Practices.csv'

In [None]:
# Read the raw CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer=loc)
df.head(n=5)

Unnamed: 0,BP Number,Priority,Description,Network Type(s),Industry Role(s),Keywords,Public Safety and Disaster,Reference,cable,internet/Data,satellite,wireless,wireline,Service Provider,Network Operator,"Priority (1,2,3)",Equipment Supplier,Property Manager,Government,Public Safety
0,12-10-0436,Highly Important,"Network Operators, Service Providers, and Publ...",Cable; Internet/Data; Satellite; Wireless; Wir...,Service Provider; Network Operator; Public Saf...,Network Operations; Procedures;,True,,True,True,True,True,True,True,True,2.0,False,False,False,True
1,12-10-0437,Highly Important,Network Operators and Service Providers should...,Internet/Data;,Service Provider; Network Operator;,Cyber Security; Network Operations; Network Pr...,False,,False,True,False,False,False,True,True,2.0,False,False,False,False
2,12-10-0440,Highly Important,Network Operators and Service Providers should...,Internet/Data;,Service Provider; Network Operator;,Industry Cooperation; Network Operations;,False,,False,True,False,False,False,True,True,2.0,False,False,False,False
3,12-10-0447,Important,Network Operators and Service Providers should...,Cable; Internet/Data; Satellite; Wireless; Wir...,Service Provider; Network Operator;,Liaison; Network Operations;,False,Note: This Best practice could impact 9-1-1 op...,True,True,True,True,True,True,True,1.0,False,False,False,False
4,12-10-0448,Highly Important,"Equipment Suppliers should where feasible, pr...",Cable; Internet/Data; Satellite; Wireless; Wir...,Equipment Supplier;,Hardware; Network Elements; Network Provisioni...,False,,True,True,True,True,True,False,False,2.0,True,False,False,False


In [None]:
#print a quick summary of a data frame and replace blank spaces in its column names with underscores
def eda(dataframe):
    """Print an exploratory summary of ``dataframe``.

    Side effect: the column labels of ``dataframe`` are rewritten in place,
    with every space replaced by an underscore.

    Printed, in order: total number of missing cells, the index, the column
    dtypes, the shape, ``describe()``, the number of fully duplicated rows,
    and then each column name followed by its number of distinct values.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to summarise; its columns are renamed in place.

    Returns
    -------
    None
    """
    dataframe.columns = dataframe.columns.str.replace(" ", "_")
    print("missing values{}".format(dataframe.isnull().sum().sum()))
    print("dataframe index: {}".format(dataframe.index))
    print("dataframe types: {}".format(dataframe.dtypes))
    print("dataframe shape: {}".format(dataframe.shape))
    print("dataframe describe: {}".format(dataframe.describe()))
    # duplicated() is a boolean mask of repeated rows; summing the mask counts
    # them. (Summing the duplicated rows themselves adds up their cell values.)
    print("duplicates{}".format(dataframe.duplicated().sum()))
    for item in dataframe:
        print(item)
        print(dataframe[item].nunique())

# Summarise the loaded frame. Side effect: eda() rewrites df.columns, replacing
# spaces with underscores, so later cells must use the underscored names.
eda(df)

missing values778
dataframe index: RangeIndex(start=0, stop=1092, step=1)
dataframe types: BP_Number                      object
Priority                       object
Description                    object
Network_Type(s)                object
Industry_Role(s)               object
Keywords                       object
Public_Safety_and_Disaster       bool
Reference                      object
cable                            bool
internet/Data                    bool
satellite                        bool
wireless                         bool
wireline                         bool
Service_Provider                 bool
Network_Operator                 bool
Priority_(1,2,3)              float64
Equipment_Supplier               bool
Property_Manager                 bool
Government                       bool
Public_Safety                    bool
dtype: object
dataframe shape: (1092, 20)
dataframe describe:        Priority_(1,2,3)
count       1006.000000
mean           1.727634
std            

In [None]:
#a function to convert NaN's in the data set to 'None' for string objects.
def convert_str_nan(data):
    """Replace missing values with the string 'None' in the text columns of ``data``.

    Only object-dtype columns are touched; numeric and boolean columns keep
    their NaN values. ``data`` is modified in place, because callers discard
    the return value.

    The previous implementation called ``replace(..., inplace=True)`` on the
    temporary copy produced by ``astype(object)``, so ``data`` was never
    changed.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose object-dtype columns should have NaN filled.

    Returns
    -------
    None
    """
    for column in data.select_dtypes(include='object').columns:
        data[column] = data[column].fillna('None')
# The return value is not captured, so this call only has an effect if
# convert_str_nan changes df in place.
convert_str_nan(df)

In [None]:
# Collapse the 1/2/3 priority score into a binary target:
#   score == 1     -> 0
#   anything else  -> 1   (scores 2 and 3, and rows whose score is missing)
# NOTE(review): rows with a missing score land in class 1 -- confirm that is intended.
raw_priority_column = 'Priority_(1,2,3)'

# Only binarise while the raw column is still present, so re-running this cell
# cannot flip labels that have already been converted to 0/1.
if raw_priority_column in df.columns:
    df.rename(columns = {raw_priority_column: 'Priorities'}, inplace = True)
    # NaN != 1 evaluates to True, which matches the original "else 1" branch.
    df['Priorities'] = (df['Priorities'] != 1).astype('int64')

df['Priorities'].value_counts()

1    627
0    465
Name: Priorities, dtype: int64

In [None]:
# Strip noise from the free-text descriptions before tokenising. Every match is
# replaced by a single space. Patterns are raw strings so the backslashes reach
# the regex engine without relying on invalid Python escape sequences.
description_cleanup_patterns = [
    # " r/<token>" or " /r/<token>" (presumably reddit-style references). The
    # original character class was [^s] ("anything but the letter s"), which let
    # a match run across whitespace; \S stops at the end of the token.
    re.compile(r'\s/?r/\S+'),
    # http:// and https:// URLs.
    re.compile(r'https?://\S*'),
    # Industry-role names (case-insensitive), with an optional trailing "s".
    re.compile(r'(service providers|equipment suppliers|network operators|property managers|public safety)s?', flags = re.I),
]


def _clean_description(text):
    """Apply each cleanup pattern to ``text`` in order and return the result."""
    for pattern in description_cleanup_patterns:
        text = pattern.sub(' ', text)
    return text


df['Description'] = df['Description'].map(_clean_description)

In [None]:
# convert_str_nan is already defined in an earlier cell; the copy that used to
# live here was identical and only shadowed it, so this cell just calls it.
convert_str_nan(df)

In [None]:
#preprocessing the data
def preprocessed_columns(dataframe = None,
                        column = 'Description',
                        new_lemma_column = 'lemmatized',
                        new_stem_column = 'stemmed',
                        new_token_column = 'tokenized',
                        regular_expression = r'\w+'):
    """Add tokenized, lemmatized and stemmed versions of a text column.

    Each value of ``dataframe[column]`` is lower-cased and split with
    ``RegexpTokenizer(regular_expression)``. The tokens, their WordNet lemmas
    and their Porter stems are each re-joined with single spaces and stored
    in three new columns.

    Parameters
    ----------
    dataframe : pandas.DataFrame, optional
        Frame to process. It is modified in place. When omitted, the
        notebook-level ``df`` is looked up at call time (the previous default
        captured whatever ``df`` was when the function was defined).
    column : str
        Name of the text column to process; every value must be a string.
    new_lemma_column, new_stem_column, new_token_column : str
        Names of the columns that receive the lemmatized, stemmed and
        tokenized text.
    regular_expression : str
        Pattern handed to ``RegexpTokenizer``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the three columns added.
    """
    # PorterStemmer is not imported in the notebook's import cell, so import
    # it here to keep the function usable on a fresh kernel.
    from nltk.stem import PorterStemmer

    if dataframe is None:
        dataframe = df

    tokenizer = RegexpTokenizer(regular_expression)
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()

    tokenized = []
    lemmatized = []
    stemmed = []

    for text in dataframe[column]:
        tokens = tokenizer.tokenize(text.lower())
        tokenized.append(' '.join(tokens))
        lemmatized.append(' '.join(lemmatizer.lemmatize(token) for token in tokens))
        stemmed.append(' '.join(stemmer.stem(token) for token in tokens))

    dataframe[new_token_column] = tokenized
    dataframe[new_lemma_column] = lemmatized
    dataframe[new_stem_column] = stemmed

    return dataframe

# `processed` is the same object as `df` (the frame is modified in place and
# returned), not a copy.
processed = preprocessed_columns(df)

In [None]:
# Bag-of-words counts for the tokenized descriptions: English stop words are
# dropped, unigrams and bigrams are kept, and a term must appear in at least 25
# documents and in at most 95% of them.
cv = CountVectorizer(stop_words = 'english', ngram_range = (1,2), min_df = 25, max_df = .95)
token_counts = cv.fit_transform(processed['tokenized'])

# get_feature_names() was replaced by get_feature_names_out() in newer
# scikit-learn releases; use whichever this installation provides.
if hasattr(cv, 'get_feature_names_out'):
    cv_feature_names = cv.get_feature_names_out()
else:
    cv_feature_names = cv.get_feature_names()

# pd.SparseDataFrame no longer exists in pandas >= 1.0. The count matrix is
# converted to a dense frame instead, which also makes the fillna(0) step
# unnecessary: absent terms are already 0.
cv_df_token = pd.DataFrame(token_counts.toarray(), columns = cv_feature_names)
cv_df_token.head()

Unnamed: 0,access,address,analysis,applicable,applications,applies,applies ng9,appropriate,associated,authentication,...,traffic,training,unauthorized,use,used,user,users,using,wireless,work
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1087,1,0,0,0,1,0,0,0,1,2,...,1,0,0,0,0,0,0,0,0,0
1088,5,0,0,0,0,0,0,0,1,2,...,1,0,2,1,0,0,0,0,0,0
1089,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1090,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0


In [None]:
# Split the rows by their binary priority label and report each subset's shape.
is_label_one = processed['Priorities'] == 1
is_label_zero = processed['Priorities'] == 0
df_really_important = processed[is_label_one]
df_not_important = processed[is_label_zero]
for subset in (df_really_important, df_not_important):
    print(subset.shape)

(627, 23)
(465, 23)


In [None]:
# Features: the space-joined token strings. Target: the binary priority label.
# `y` is required by the train/test split below but was not defined in any
# earlier cell, so a fresh "Restart & Run All" raised NameError.
X_1 = processed['tokenized']
y = processed['Priorities']

In [None]:
# Hold out 30% of the rows for testing. stratify = y keeps the class proportions
# the same in both parts, and random_state fixes the shuffle so the split is
# reproducible.
X_1_train, X_1_test, y_train, y_test = train_test_split(X_1, y, test_size = 0.3, stratify = y, random_state = 42)

In [None]:
# Text-classification pipeline: raw token strings -> word counts -> logistic regression.
# The solver is pinned to liblinear because the grid below searches both the l1
# and l2 penalties, and the default solver in current scikit-learn releases
# (lbfgs) does not support l1.
pipe_cv = Pipeline([
    ('cv', CountVectorizer()),
    ('lr', LogisticRegression(solver = 'liblinear'))
])

# Hyper-parameter grid for GridSearchCV: 3 * 2 * 4 * 2 * 2 = 96 combinations.
# Keys use the "<step name>__<parameter>" convention of Pipeline.
params = {
    'lr__C':[0.6, 1, 1.2],
    'lr__penalty':["l1", "l2"],
    'cv__max_features':[None, 750, 1000, 1250],
    'cv__stop_words':['english', None],
    'cv__ngram_range':[(1,1), (1,4)]
}

In [None]:
# Grid-search the pipeline with 5-fold cross-validation on the training split,
# then report the score of the refit best model on that same training split.
gs_lr_tokenized_cv = GridSearchCV(estimator = pipe_cv, param_grid = params, cv = 5)
gs_lr_tokenized_cv.fit(X_1_train, y_train)
gs_lr_tokenized_cv.score(X_1_train, y_train)


0.943717277486911

In [None]:
# Score of the best model on the held-out test split (for a classifier the
# default score is mean accuracy); compare with the training score above.
gs_lr_tokenized_cv.score(X_1_test, y_test)

0.7134146341463414

In [None]:
# Hyper-parameter combination from the grid that achieved the best mean
# cross-validated score.
gs_lr_tokenized_cv.best_params_

{'cv__max_features': None,
 'cv__ngram_range': (1, 1),
 'cv__stop_words': 'english',
 'lr__C': 1.2,
 'lr__penalty': 'l1'}

In [None]:
# Pair each vocabulary term with the exponentiated logistic-regression
# coefficient (its odds ratio) from the best pipeline found by the grid search.
best_pipeline = gs_lr_tokenized_cv.best_estimator_
fitted_vectorizer = best_pipeline.named_steps['cv']
fitted_model = best_pipeline.named_steps['lr']

# The names must come from the vectorizer fitted INSIDE the pipeline. The
# stand-alone `cv` from the earlier cell was fitted with different settings
# (min_df = 25, bigrams), so its vocabulary does not line up with these
# coefficients, and zip() silently truncated to the shorter of the two.
if hasattr(fitted_vectorizer, 'get_feature_names_out'):
    feature_names = fitted_vectorizer.get_feature_names_out()
else:
    feature_names = fitted_vectorizer.get_feature_names()

coefs = fitted_model.coef_
assert len(feature_names) == coefs.shape[1], "vocabulary and coefficients are misaligned"

# Column 0 holds the term, column 1 the odds ratio; sort ascending by odds ratio.
words = pd.DataFrame(list(zip(feature_names, np.exp(coefs[0]))))
words = words.sort_values(1)

In [None]:
# Show the term / exp(coefficient) table, sorted ascending by column 1.
words

Unnamed: 0,0,1
99,protect,0.504093
95,prevent,0.587844
121,signaling,0.598349
52,implement,0.690333
133,training,0.797896
...,...,...
43,event,1.193831
23,control,1.339411
74,network,1.536078
60,internal,1.617070
