# Basics of Text Data Analysis

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import os

Update the path as needed to read the content polluters and legitimate users CSV files into separate dataframes.

First, verify that the files are in the correct location: the next cell should print `True` three times.

In [2]:

# Folder that holds the two input CSV files. Edit this one line to match your machine.
dataDir = "/Users/mandysack/Desktop/DeepLearningAnalyticsProject-master/data/"

# Sanity checks against the folder that is actually read below:
# the folder itself and both CSV files must exist (expect three True's).
print(os.path.isdir(dataDir))
print(os.path.isfile(dataDir+"content_polluters.csv"))
print(os.path.isfile(dataDir+"legitimate_users.csv"))

# Column names are supplied explicitly through `names=` for both files.
bDcolnames=['UserID','CreatedAt','CollectedAt', 'NumberOfFollowings', 'NumberOfFollowers', 'NumberOfTweets','LengthOfScreenName','LengthOfDescriptionInUserProfile','BadUser'] 
# NumberOfFollowers is forced to int64 for the content-polluter file only.
badData = pd.read_csv(dataDir+"content_polluters.csv", names=bDcolnames, dtype = {'NumberOfFollowers' : np.int64})

nDcolnames=['UserID','CreatedAt','CollectedAt', 'NumberOfFollowings', 'NumberOfFollowers', 'NumberOfTweets','LengthOfScreenName','LengthOfDescriptionInUserProfile','BadUser']
normalData = pd.read_csv(dataDir+"legitimate_users.csv", names=nDcolnames)

True
True
True


List the column headers
### How many columns are there?

In [None]:
# Column labels of both frames, legitimate users first, one Index per line.
print(normalData.columns, badData.columns, sep="\n")

List the number of rows. The `shape` attribute returns a `(rows, columns)` tuple.
### How many rows and columns are there?

In [None]:
# (rows, columns) of each frame, legitimate users first.
print(normalData.shape, badData.shape, sep="\n")

Sort the dataframes by the 'CollectedAt' column

In [None]:
# Reorder both frames in place by their 'CollectedAt' column.
# NOTE(review): the column is read without date parsing, so this is presumably a
# string sort -- confirm the timestamp format sorts chronologically.
badData.sort_values(by='CollectedAt', inplace=True)
normalData.sort_values(by='CollectedAt', inplace=True)

Describe the dataframes

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the legitimate-user frame.
normalData.describe()

In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for the content-polluter frame.
badData.describe()

Unnamed: 0,UserID,NumberOfFollowings,NumberOfFollowers,NumberOfTweets,LengthOfScreenName,LengthOfDescriptionInUserProfile,BadUser
count,22223.0,22223.0,22223.0,22223.0,22223.0,22223.0,22223.0
mean,91586090.0,2212.416775,2308.996,1135.044548,11.283445,60.113981,1.0
std,42364400.0,5376.689112,34158.8,4301.928411,2.630156,54.093454,0.0
min,6301.0,0.0,0.0,0.0,2.0,0.0,1.0
25%,59129330.0,326.5,65.0,16.0,9.0,0.0,1.0
50%,99935900.0,874.0,295.0,113.0,11.0,54.0,1.0
75%,122527000.0,1878.0,1224.0,636.0,13.0,104.0,1.0
max,173767000.0,109388.0,4309347.0,148047.0,15.0,474.0,1.0


Print the first 5 rows of the dataframes

In [None]:
# Preview the first five rows of the legitimate-user frame.
normalData.head(5)

In [None]:
# Preview the first five rows of the content-polluter frame.
badData.head(5)

Print row 35 of each dataframe

In [None]:
# Show the row at position 35 (zero-based, current sort order) of each frame.
for frame in (normalData, badData):
    print(frame.iloc[35])

Print the value at row position 700 and column position 4 (both zero-based) of each dataframe

In [None]:
# Scalar lookup by integer position: row 700, column 4 (both zero-based).
print(normalData.iat[700, 4], badData.iat[700, 4], sep="\n")

What is the minimum and maximum of 'NumberOfFollowings' for each dataset

#### Does it match the max value given in describe?

In [None]:
# Pull the column out once per frame, then report its extremes.
normal_followings = normalData['NumberOfFollowings']
bad_followings = badData['NumberOfFollowings']

print("minimum of normalData['NumberOfFollowings'] is {} ".format(normal_followings.min()))
print("maximum of normalData['NumberOfFollowings'] is {} ".format(normal_followings.max()))
print("minimum of badData['NumberOfFollowings'] is {} ".format(bad_followings.min()))
print("maximum of badData['NumberOfFollowings'] is {} ".format(bad_followings.max()))


Merge the datasets together, keeping the columns the same

Sort the values once again by the 'CollectedAt' column

In [None]:
# Stack the content polluters on top of the legitimate users (same columns),
# then order the combined rows by the 'CollectedAt' column.
mergedData = pd.concat([badData, normalData]).sort_values(by='CollectedAt')

Save all 3 DataFrames as csv with your first initial and last name (i.e. msack for mandy sack)

In [None]:
# The merged frame is written WITH its index (pandas' default); the two
# single-class frames are written without it.
mergedData.to_csv(dataDir + "mergedData_msack.csv", index=True)
badData.to_csv(dataDir + "content_polluters_msack.csv", index=False)
normalData.to_csv(dataDir + "legitimate_users_msack.csv", index=False)

Read in the merged dataframe you just saved and list its columns. Update the path to match your own setup.
 
 ### What does the Unnamed: 0 column mean?

In [None]:
# Reload the merged CSV from the same folder it was written to. Reusing
# `dataDir` means the location only has to be edited in one place.
df_merged = pd.read_csv(dataDir+"mergedData_msack.csv")
# Column labels of the reloaded frame.
df_merged.columns


Make a copy of the merged dataframe

In [None]:
# Independent copy of df_merged for the exploratory steps that follow.
df_cp = df_merged.copy()

Drop the 'Unnamed: 0' and 'UserID' columns

In [None]:
# Build df_cp from df_merged without the saved index column and the user id.
# Deriving it from df_merged (instead of dropping in place) keeps this cell
# re-runnable: an in-place drop raises KeyError on the second execution
# because the columns are already gone.
df_cp = df_merged.drop(columns=['Unnamed: 0', 'UserID'])
df_cp

Print the covariance of the dataframe

In [None]:
# Covariance matrix of the numeric columns only.
# df_cp still contains the text columns 'CreatedAt' and 'CollectedAt';
# selecting numeric dtypes first keeps this working on pandas versions that
# no longer drop non-numeric columns silently.
df_cp.select_dtypes(include='number').cov()

Plot histograms of the dataframes, set the figure size to 10x10


In [None]:
# One histogram per plottable (numeric) column, laid out on a 10x10-inch figure.
df_cp.hist(figsize=(10,10))

Print the correlation of the dataframe


In [None]:
# Pairwise correlation of the numeric columns only. The text columns
# 'CreatedAt' and 'CollectedAt' are excluded explicitly so the call does not
# depend on pandas dropping them silently.
df_cp.select_dtypes(include='number').corr()

Plot the correlation using matshow

In [None]:
# Heat-map of the correlation matrix, restricted to numeric columns so the
# text columns in df_cp cannot break corr().
plt.matshow(df_cp.select_dtypes(include='number').corr())

Here is another way to plot the correlation

In [None]:
# Correlation matrix of the numeric columns, computed once and reused below.
corr_matrix = df_cp.select_dtypes(include='number').corr()
# Full column names: truncating to 10 characters made 'NumberOfFollowings'
# and 'NumberOfFollowers' indistinguishable ('NumberOfFo').
labels = list(corr_matrix.columns)

fig = plt.figure(figsize=(10,10))
ax = fig.add_subplot(111)
# Draw first: matshow installs its own tick locators, so ticks and labels set
# before it end up misaligned with the matrix cells.
ax.matshow(corr_matrix, cmap=plt.cm.RdYlGn)
ax.set_xticks(np.arange(len(labels)))
ax.set_yticks(np.arange(len(labels)))
# Rotate the x labels so the long names do not overlap.
ax.set_xticklabels(labels, rotation=90)
ax.set_yticklabels(labels)
plt.show()

Group the dataframe by ['BadUser'].

Then list the number of groups; there should be 2.

In [None]:
# Partition the rows by their BadUser value; as_index=False keeps BadUser as a
# regular column in aggregated results.
gb = df_cp.groupby(by=['BadUser'], as_index=False)

# How many distinct groups the split produced.
gb.ngroups

Copy `df_cp`, then plot each of the columns.

When all of the columns are plotted separately like this, it is hard to see any patterns.

Adding a rolling mean will enable you to better visually see the data 


In [None]:
# Fresh working copy so the plotting experiments do not touch df_cp.
df = df_cp.copy()
# Each call plots one raw column against the row index. Within a single cell
# the successive calls land on the same axes, producing one overlaid chart
# (no legend is added here).
df['NumberOfFollowings'].plot()
df['NumberOfFollowers'].plot()
df['NumberOfTweets'].plot()
df['LengthOfScreenName'].plot()
df['LengthOfDescriptionInUserProfile'].plot()

Assign a variable to the df['NumberOfFollowings'].rolling().mean(), then plot the variable

Try with a rolling value of 25, 350, and 1000. What happens?

In [None]:
# Smooth NumberOfFollowings with a 700-row moving average, then chart it.
df_NOF_mean = df['NumberOfFollowings'].rolling(window=700).mean()
df_NOF_mean.plot()

Assign a variable to the df['NumberOfFollowers'].rolling().mean(), then plot the variable

Try with at least 3 different rolling values. What happens?

In [None]:
# Smooth NumberOfFollowers with a 2000-row moving average, then chart it.
df_NOFs_mean = df['NumberOfFollowers'].rolling(window=2000).mean()
df_NOFs_mean.plot()

Assign a variable to the df['LengthOfScreenName'].rolling().mean(), then plot the variable

Try with at least 3 different rolling values. What happens?

In [None]:
# Smooth LengthOfScreenName with a 500-row moving average, then chart it.
df_LOSN_mean = df['LengthOfScreenName'].rolling(window=500).mean()
df_LOSN_mean.plot()

Assign a variable to the df['LengthOfDescriptionInUserProfile'].rolling().mean(), then plot the variable

Try with at least 3 different rolling values. What happens?

In [None]:
# Smooth LengthOfDescriptionInUserProfile with a 1500-row moving average, then chart it.
df_LODIUP_mean = df['LengthOfDescriptionInUserProfile'].rolling(window=1500).mean()
df_LODIUP_mean.plot()

Assign a variable to the df['NumberOfTweets'].rolling().mean(), then plot the variable

Try with at least 3 different rolling values. What happens?

Does the shape of this plot look different from the others?

In [None]:
# Smooth NumberOfTweets with a 3500-row moving average, then chart it.
df_NumOTW_mean = df['NumberOfTweets'].rolling(window=3500).mean()
df_NumOTW_mean.plot()

Plot the ['BadUser'] column

In [None]:
# Plot the raw BadUser column against the row index.
df['BadUser'].plot()

Create a plot of figsize 15x10, show the gridlines, 
plot all of the rolling mean variables that were just created with the appropriate label
plot the legend at loc=2
show the plot

How does this plot look different from the plot we first did?

In [None]:
# One 15x10 figure with gridlines; every rolling-mean series is drawn in the
# order listed and gets its own legend entry.
plt.figure(figsize=[15,10])
plt.grid(True)
for series, caption in [
    (df_NOFs_mean, 'Num of Followers (mean)'),
    (df_NOF_mean, 'Num of Followings (mean)'),
    (df_LOSN_mean, 'Len Screen Name (mean)'),
    (df_LODIUP_mean, 'Len Description User Profile (mean)'),
    (df_NumOTW_mean, 'Num Of Tweets (mean)'),
]:
    plt.plot(series, label=caption)
plt.legend(loc=2)  # loc=2 places the legend in the upper-left corner
plt.show()

Create a plot of figsize 15x10, show the gridlines, 
plot the Len Screen Name and Len Description User Profile rolling mean variables that were just created with the appropriate label along with the Bad User 
plot the legend at loc=2
show the plot

How does this plot look different from the plot we first did?

In [None]:
# Overlay the raw BadUser column with the two length-based rolling means on a
# single 15x10 figure with gridlines.
plt.figure(figsize=[15,10])
plt.grid(True)
for series, caption in [
    (df['BadUser'], 'BadUser'),
    (df_LOSN_mean, 'Len Screen Name (mean)'),
    (df_LODIUP_mean, 'Len Description User Profile (mean)'),
]:
    plt.plot(series, label=caption)
plt.legend(loc=2)  # loc=2 places the legend in the upper-left corner
plt.show()

In [None]:
# Second scratch copy of df_cp, used only for the quick check in the next cell.
df_c2 = df_cp.copy()

In [None]:
# Peek at the first rows of NumberOfFollowers (head() defaults to five rows).
df_c2['NumberOfFollowers'].head()

In [None]:
# Scratch copy of the reloaded merged frame (still includes 'Unnamed: 0' and 'UserID').
test = df_merged.copy()

In [None]:
# Summary statistics for NumberOfFollowers across the merged data.
test['NumberOfFollowers'].describe()

The next section will put an X on each of the lines where BadUser changes (i.e. from 1 to 0 or 0 to 1) - TODO Later

In [None]:
# A way to find out when the changes happened in all of the columns can be done like this
df.ne(df.shift()).apply(lambda x: x.index[x].tolist())

In [None]:
# Look at the first BadUser value to see which class the series starts with;
# the transition searches below depend on that starting value.
df_cp['BadUser'].iloc[0]

In [None]:
# The series starts at 1 (per the previous cell), so first locate every row
# where BadUser falls from 1 to 0: a row-to-row difference of exactly -1.
df_diff_to_zero = df_cp['BadUser'].diff().eq(-1)
# np.where returns a 1-tuple for 1-D input; unpack the positional indices.
(userAtZero,) = np.where(df_diff_to_zero)

In [None]:
# Now locate every row where BadUser rises from 0 back to 1: a row-to-row
# difference of exactly +1.
df_diff_to_one = df_cp['BadUser'].diff().eq(1)
# np.where returns a 1-tuple for 1-D input; unpack the positional indices.
(userAtOne,) = np.where(df_diff_to_one)

In [None]:
#We want to draw an X at the point where the good users start and stop on the lines,
#We will need to find the y values when the x values change  
#col = ['NumberOfFollowings', 'NumberOfFollowers', 'NumberOfTweets', 'LengthOfScreenName','LengthOfDescriptionInUserProfile']
#x_points = np.vstack((userAtZero, userAtOne))
#points = np.empty([len(col)*len(x_points), 2])
#points = np.array([])
#print(points.shape)
#for i in col:
#    print(i)
#    for x in x_points:
#        x = x.item()
#        new_point = df_cp.at[x,i]
#        #print(new_point)
#        #print(np.array([x,new_point]))
#        #points = np.insert(a, 1, 5)
#        points = np.append(points, np.array([x, new_point]),axis=0)
#        print(points)
#print(points)
#points.shape