In [1]:
# Basic imports
import os
import time
import timeit
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Machine Learning packages
from sklearn import ensemble
from sklearn.feature_selection import chi2, f_classif, SelectKBest 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.preprocessing import normalize

# Natural Language processing
import nltk
import re
from collections import Counter
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.datasets import fetch_rcv1
from sklearn.feature_extraction.text import TfidfVectorizer

# Clustering packages
import sklearn.cluster as cluster
from sklearn.cluster import KMeans, MeanShift, estimate_bandwidth, SpectralClustering, AffinityPropagation
from scipy.spatial.distance import cdist

# Plotly packages
import plotly as py
import plotly.figure_factory as ff
import plotly.graph_objs as go
from plotly import tools
import cufflinks as cf
import ipywidgets as widgets
from scipy import special
py.offline.init_notebook_mode(connected=True)

In [3]:
# Import the data
fcc_comments = pd.read_csv("FCC Comments/deidentified_survey_results.csv")


Columns (7) have mixed types. Specify dtype option on import or set low_memory=False.



In [5]:
# Preview the dataframe
fcc_comments.head()

Unnamed: 0.1,Unnamed: 0,docid,campaign,submitted,email_hash,email_domain,short_comment,email_valid,not_original_commenter,received,block,country,region,bounced_or_filtered,send_failed,pro_repeal
0,0,19970698,3,2017-08-11T14:57:52+00:00,ed33934cc5701a337e210e61d5c12567,yahoo.com,"In 2015, Chairman Tom Wheelers Federal Commun...",True,,,,,,False,False,True
1,1,20235884,3,2017-08-11T15:09:48+00:00,212ce60ccc6d3f0afed35e2d4d5c1050,gmail.com,"In 2015, Chairman Tom Wheelers Federal Commun...",True,,,,,,False,False,True
2,2,16906326,3,2017-08-04T20:03:40+00:00,07ef30726e18a5672f8cac9c2c45c72e,sbcglobal.net,"In 2015, Chairman Tom Wheelers Federal Commun...",True,,,,,,False,False,True
3,3,20626870,3,2017-08-11T19:46:40+00:00,4b6479817123ced072a67e425d683fd6,yahoo.com,"In 2015, Chairman Tom Wheelers Federal Commun...",True,,,,,,False,False,True
4,4,20354907,3,2017-08-11T16:14:38+00:00,71e22da0855795acdb733cd0018ba95a,yahoo.com,"In 2015, Chairman Tom Wheelers Federal Commun...",True,,,,,,False,False,True


In [18]:
list(fcc_comments.columns.values)

['Unnamed: 0',
 'docid',
 'campaign',
 'submitted',
 'email_hash',
 'email_domain',
 'short_comment',
 'email_valid',
 'not_original_commenter',
 'received',
 'block',
 'country',
 'region',
 'bounced_or_filtered',
 'send_failed',
 'pro_repeal']

In [24]:
# Drop [] because they don't contain any valuable information
fcc_comments.drop(['Unnamed: 0', 'docid', 'email_hash'], axis=1, inplace=True)

In [25]:
# Print the size of the dataframe
print("Size: " + str(fcc_comments.shape))

Size: (449659, 13)


In [27]:
# Print number of unique categories for all non-numerical columns
fcc_comments.select_dtypes(include=['object']).nunique()

submitted        227369
email_domain       7736
short_comment     43722
email_valid           2
received          12161
block                 5
country              78
region              160
dtype: int64

In [33]:
comments = fcc_comments['short_comment'].value_counts()
comments.head(20)

The Open Internet rules (net neutrality) are extremely important to me. I don't want the ISP to have the power to block websites, slow them down, give some sites advantage on others, or split the internet into fast lanes for companies that ...                11551
Please save the internet from the corporations. Tom Wheeler was right. Let the new neutrality stand....                                                                                                                                                            10255
We need net neutralityto continue. A free and open internet is the single greatest technology of our time, and control should not be at the mercy of corporations....                                                                                              10244
The Title II order created a gaping gap in privacy protections by taking the best cop, the FTC, off the beat. That is reason enough to support Chairman Pai's proposal to restore Internet freedom. Restore p

# Text Cleaning

In [34]:
def text_cleaner(text):
    # Visual inspection identifies a form of punctuation spaCy does not
    # recognize: the double dash '--'.  Better get rid of it now!
    text = re.sub(r'--',' ',text)
    text = re.sub("[\[].*?[\]]", "", text)
    text = ' '.join(text.split())
    return text

In [35]:
# Remove annoying punctuation from the articles
pd.options.display.max_colwidth = 200
fcc_comments['short_comment'] = fcc_comments['short_comment'].map(lambda x: text_cleaner(str(x)))
fcc_comments['short_comment'].head()

0    In 2015, Chairman Tom Wheelers Federal Communications Commission (FCC) imposed restrictive Title II, utility-style regulations under the guise of an open internet. Not only have these regulatio...
1    In 2015, Chairman Tom Wheelers Federal Communications Commission (FCC) imposed restrictive Title II, utility-style regulations under the guise of an open internet. Not only have these regulatio...
2    In 2015, Chairman Tom Wheelers Federal Communications Commission (FCC) imposed restrictive Title II, utility-style regulations under the guise of an open internet. Not only have these regulatio...
3    In 2015, Chairman Tom Wheelers Federal Communications Commission (FCC) imposed restrictive Title II, utility-style regulations under the guise of an open internet. Not only have these regulatio...
4    In 2015, Chairman Tom Wheelers Federal Communications Commission (FCC) imposed restrictive Title II, utility-style regulations under the guise of an open internet. Not only

# Natural Language Processing

In [37]:
lemmatizer = WordNetLemmatizer()

# Reduce all text to their lemmas
for comment in fcc_comments['short_comment']:
    comment = lemmatizer.lemmatize(comme nt)

In [39]:
# Predict job title using text data
X = fcc_comments['short_comment']
y = fcc_comments['pro_repeal']

# Create training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)