Preliminary Data Analysis of Mozilla SurveyGizmo Feedback Data

Note: you must also install vaderSentiment (`pip install vaderSentiment`). This notebook assumes that the files 'Top Sites for Report Analysis.csv' and '20180912113810-SurveyExport.csv' are located in the same directory as the notebook.

In [1]:
import pandas as pd
import re

In [2]:
# Build two lookup lists used later to tag each survey response:
#   siteList   - every value from the 'Domains' and 'Brand' columns of the
#                top-sites sheet, one site per entry
#   issuesList - the comma-separated keywords stored in issues.txt
sites = pd.read_csv('C:/Users/amritpd/Downloads/Top Sites for Report Analysis.csv', usecols=['Domains','Brand'])
raw_sites = list(sites.values.flatten())

# The context manager closes issues.txt as soon as it has been read.
with open('issues.txt', 'r') as issuesFile:
    issuesList = issuesFile.read().split(',')

# A single cell can hold several domains ('salesforce.com, force.com').
# Keep the single-site cells first, then append the pieces of the
# multi-site cells, without growing a list while iterating over it.
siteList = [site for site in raw_sites if ',' not in site]
for cell in raw_sites:
    if ',' in cell:
        siteList += cell.split(',')

# Splitting 'salesforce.com, force.com' leaves ' force.com' with a leading
# space, so trim whitespace first and then the '.' / '*' wildcard characters.
siteList = [site.strip().strip('.*') for site in siteList]
issuesList = [issue.strip('\n') for issue in issuesList]

#print(issuesList)

In [3]:
fields = ['KI', 'KN', 'KP']

# Columns to pull from the raw survey export; everything else is ignored.
survey_cols = [
    "Response ID",
    "Time Started",
    "Date Submitted",
    "Status",
    "Language",
    "Referer",
    "Extended Referer",
    "User Agent",
    "Extended User Agent",
    "Longitude",
    "Latitude",
    "Country",
    "City",
    "State/Region",
    "Postal",
    "How does Firefox make you feel?",
    "OS",
    "To help us understand your input, we need more information. Please describe what you like. The content of your feedback will be public, so please be sure not to include personal information such as email address, passwords or phone number.",
    "To help us understand your input, we need more information. Please describe your problem below and be as specific as you can. The content of your feedback will be public, so please be sure not to include personal information such as email address, passwords or phone number.",
    "If your feedback is related to a website, you can include it here:",
]
df = pd.read_csv(
    "C:/Users/amritpd/Downloads/20180928025007-SurveyExport.csv",
    encoding="ISO-8859-1",
    nrows=2500,
    usecols=survey_cols,
)

# Give the long survey-question columns short, workable names.
short_names = {
    survey_cols[15]: 'Binary Sentiment',
    survey_cols[17]: 'Positive Feedback',
    survey_cols[18]: 'Negative Feedback',
    survey_cols[19]: 'Relevant Site',
}
df = df.rename(columns=short_names)

# Replace NaNs with blanks so the text columns can be concatenated later.
df = df.fillna('')

# Keep only completed surveys that were answered in English.
completed = df['Status'] == 'Complete'
english = df['Language'] == 'English'
df = df.loc[completed & english]

# Parse the two timestamp columns into pandas datetimes.
for column in ("Date Submitted", "Time Started"):
    df[column] = pd.to_datetime(df[column])

# The empty case still needs real handling later; for now just report it.
print('DataFrame is empty!' if df.empty else 'Not empty!')

Not empty!


In [4]:
#Derive certain columns
#start with basic sentiment analysis
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

analyzer = SIA()

# Score each response on its positive and negative feedback together (a user
# may have filled in both). The space keeps the last word of one text from
# fusing with the first word of the other.
results = [
    analyzer.polarity_scores(positive + ' ' + negative)
    for positive, negative in zip(df['Positive Feedback'], df['Negative Feedback'])
]

# df was row-filtered earlier, so its index has gaps. Build the score table on
# df's own index; a fresh 0..n-1 index would make the index-on-index merge
# below pair scores with the wrong rows and silently drop unmatched ones.
df2 = pd.DataFrame(results, index=df.index)

# Append the score columns returned by polarity_scores to the survey rows.
df = pd.merge(df, df2, left_index=True, right_index=True)



In [5]:
def mentionedSite(series, sites=None):
    """Return the known sites mentioned in one survey response.

    The 'Relevant Site', 'Positive Feedback' and 'Negative Feedback' fields
    of *series* are searched case-insensitively for every candidate site.

    Args:
        series: Mapping-like row providing the three text fields above.
        sites: Candidate site names; defaults to the module-level siteList.

    Returns:
        The matching sites, lower-cased, de-duplicated, in candidate order
        and joined with ', ' (an empty string when nothing matches).
    """
    if sites is None:
        sites = siteList
    # Join with spaces so text from adjacent fields cannot fuse into a
    # spurious match, and lower-case once for case-insensitive matching.
    combined = ' '.join((
        series['Relevant Site'],
        series['Positive Feedback'],
        series['Negative Feedback'],
    )).lower()
    mentioned = []
    for site in sites:
        needle = site.lower()
        # An empty candidate is a substring of every text, so skip it.
        if needle and needle in combined and needle not in mentioned:
            mentioned.append(needle)
    return ', '.join(mentioned)

def mentionedIssue(series, issues=None):
    """Return the known issue keywords mentioned in one survey response.

    The 'Positive Feedback' and 'Negative Feedback' fields of *series* are
    searched case-insensitively for every candidate keyword.

    Args:
        series: Mapping-like row providing the two feedback fields.
        issues: Candidate keywords; defaults to the module-level issuesList.

    Returns:
        The matching keywords, lower-cased, de-duplicated, in candidate
        order and joined with ', ' (an empty string when nothing matches).
    """
    if issues is None:
        issues = issuesList
    # Join with a space so the two texts cannot fuse into a spurious match,
    # and lower-case once for case-insensitive matching.
    combined = ' '.join((series['Positive Feedback'], series['Negative Feedback'])).lower()
    mentioned = []
    for issue in issues:
        needle = issue.lower()
        # An empty keyword is a substring of every text, so skip it.
        if needle and needle in combined and needle not in mentioned:
            mentioned.append(needle)
    return ', '.join(mentioned)
    
# Tag every response row with the top sites and the issue keywords it
# mentions, one derived column per tagger.
for column, tagger in (('Sites', mentionedSite), ('Issues', mentionedIssue)):
    df[column] = df.apply(tagger, axis=1)
    

In [8]:
#Some very basic visualizations
import ipywidgets as widgets

# Quick overview of the number of sad/happy responses over time, driven by a
# date-range slider spanning the submission dates present in df.
start_date = df["Date Submitted"].min()
end_date = df["Date Submitted"].max()
display("Most Recent Date: ", end_date)

# Snap the ticks to whole days that enclose every submission. Stepping daily
# from the raw start timestamp would put the last tick up to a day before
# end_date, leaving the newest responses outside even the widest selection.
dates = pd.date_range(start_date.floor('D'), end_date.ceil('D'), freq='D')

# Each option pairs the label shown on the slider with its timestamp value.
options = [(date.strftime(' %d %b %Y '), date) for date in dates]
# Start with the whole range selected.
index = (0, len(options)-1)

slide = widgets.SelectionRangeSlider(
    options=options,
    index=index,
    description='Dates',
    orientation='horizontal',
    layout={'width': '500px'}
)
# Trends over time: binary sentiment counts for a chosen period.
def binary_sentiment(slider):
    """Display the 'Binary Sentiment' value counts for a date window.

    slider -- pair of (first date, last date); both ends are inclusive.
    """
    first_day = slider[0]
    last_day = slider[1]
    submitted = df['Date Submitted']
    not_before = submitted >= first_day
    not_after = submitted <= last_day
    window = df[not_before & not_after]
    counts = window["Binary Sentiment"].value_counts()
    display("Happy-Sadness counts in this time interval: ", counts)


widgets.interactive(binary_sentiment, slider=slide)
# Observation from the author: there is a burst of new feedback after
# December 19th 2016. This could be because of Mozilla's release of V50.1
# earlier that week, which probably introduced some breaking changes.

'Most Recent Date: '

Timestamp('2016-12-20 22:07:08')

In [7]:
# Pick one of the top sites from a dropdown; the comments relevant to it are
# then shown sorted by most negative and by most recent.
topSites = [
    'facebook',
    'yahoo',
    'google',
    'twitter',
    'netflix',
]
tsdrop = widgets.Dropdown(
    options=topSites,
    description="Top Sites",
)
def basic_site_analysis(drop):
    """Display the most negative and the most recent comments for a site.

    Rows are selected when their 'Relevant Site' field contains *drop*.

    Args:
        drop: Site name to look for, e.g. one of the dropdown options.
    """
    display("MOST NEGATIVE COMMENTS for " + drop)
    # Match the site name literally and ignoring case, so 'Facebook.com'
    # counts for 'facebook' and the name is never interpreted as a regex.
    mentions_site = df['Relevant Site'].str.contains(drop, case=False, regex=False)
    subset = df[mentions_site]
    # The three rows with the lowest compound sentiment score.
    display(subset.nsmallest(3, 'compound'))
    display("MOST RECENT COMMENTS for " + drop)
    # The three rows with the latest submission date.
    display(subset.nlargest(3, 'Date Submitted'))


# NOTE(review): the original left a placeholder here for an equivalent
# per-issue selector; it is not implemented.
widgets.interactive(basic_site_analysis,drop=tsdrop)