In [1]:
#Loading data into Python
import pandas as pd #import pandas library
import numpy as np #import numpy library

# Read the raw CSV and discard the status_id column in one non-mutating chain.
df = pd.read_csv('ThaiLiveSell.csv').drop(columns=['status_id'])
df

Unnamed: 0,status_type,status_published,num_reactions,num_comments,num_shares,num_likes,num_loves,num_wows,num_hahas,num_sads,num_angrys
0,video,4/22/2018 6:00,529,512,262,432,92,3,1,1,0
1,photo,4/21/2018 22:45,150,0,0,150,0,0,0,0,0
2,video,4/21/2018 6:17,227,236,57,204,21,1,1,0,0
3,photo,4/21/2018 2:29,111,0,0,111,0,0,0,0,0
4,photo,4/18/2018 3:22,213,0,0,204,9,0,0,0,0
5,photo,4/18/2018 2:14,217,6,0,211,5,1,0,0,0
6,video,4/18/2018 0:24,503,614,72,418,70,10,2,0,3
7,video,4/17/2018 7:42,295,453,53,260,32,1,1,0,1
8,photo,4/17/2018 3:33,203,1,0,198,5,0,0,0,0
9,photo,4/11/2018 4:53,170,9,1,167,3,0,0,0,0


In [2]:
# Cleaning data: remove exact duplicate rows and parse status_published into
# datetimes so its components can be read through the .dt accessor.
# Creating new attributes for a more specific analysis: the hour of day and the
# numeric day of week (0 is Monday, 6 is Sunday) of each publication time.
df = (
    df.drop_duplicates()
    .assign(status_published=lambda frame: pd.to_datetime(frame['status_published']))
    .assign(
        hour_published=lambda frame: frame['status_published'].dt.hour,
        dayofweek_published=lambda frame: frame['status_published'].dt.dayofweek,
    )
)
def text_day(x):
    """Convert a numeric day of week into its English weekday name.

    Parameters
    ----------
    x : int or float
        Day-of-week number using the pandas convention (0 is Monday,
        6 is Sunday). Integral floats such as 2.0 are accepted.

    Returns
    -------
    str or None
        The weekday name, or None when ``x`` is not one of 0-6 (for example
        NaN produced by a missing timestamp). Previously every such value was
        silently labelled 'Sunday'.
    """
    weekday_names = {
        0: 'Monday',
        1: 'Tuesday',
        2: 'Wednesday',
        3: 'Thursday',
        4: 'Friday',
        5: 'Saturday',
        6: 'Sunday',
    }
    try:
        # NaN never equals a key, so it falls through to None like any
        # other out-of-range value.
        return weekday_names.get(x)
    except TypeError:
        # Unhashable input can never be a valid day number.
        return None
# Attach the readable weekday name that corresponds to each numeric day of week.
df['day_of_week_published'] = df['dayofweek_published'].map(text_day)

def reactions_level(x, low_threshold=100, high_threshold=1000):
    """Bucket a reaction count into 'Low', 'Medium' or 'High'.

    Parameters
    ----------
    x : int or float
        Number of reactions.
    low_threshold : int or float, optional
        Counts strictly below this value are 'Low' (default 100).
    high_threshold : int or float, optional
        Counts at or above this value are 'High' (default 1000); counts from
        ``low_threshold`` up to but excluding this value are 'Medium'.

    Returns
    -------
    str or None
        The bucket label, or None when ``x`` cannot be ordered against the
        thresholds (NaN). Previously NaN failed both comparisons and was
        labelled 'High'.
    """
    if x < low_threshold:
        return 'Low'
    if x < high_threshold:
        return 'Medium'
    if x >= high_threshold:
        return 'High'
    # Only reached when every comparison is False, i.e. x is NaN.
    return None
# Label every post with its reaction bucket, then display the enriched frame.
df['level_of_reactions'] = df['num_reactions'].map(reactions_level)

df

Unnamed: 0,status_type,status_published,num_reactions,num_comments,num_shares,num_likes,num_loves,num_wows,num_hahas,num_sads,num_angrys,hour_published,dayofweek_published,day_of_week_published,level_of_reactions
0,video,2018-04-22 06:00:00,529,512,262,432,92,3,1,1,0,6,6,Sunday,Medium
1,photo,2018-04-21 22:45:00,150,0,0,150,0,0,0,0,0,22,5,Saturday,Medium
2,video,2018-04-21 06:17:00,227,236,57,204,21,1,1,0,0,6,5,Saturday,Medium
3,photo,2018-04-21 02:29:00,111,0,0,111,0,0,0,0,0,2,5,Saturday,Medium
4,photo,2018-04-18 03:22:00,213,0,0,204,9,0,0,0,0,3,2,Wednesday,Medium
5,photo,2018-04-18 02:14:00,217,6,0,211,5,1,0,0,0,2,2,Wednesday,Medium
6,video,2018-04-18 00:24:00,503,614,72,418,70,10,2,0,3,0,2,Wednesday,Medium
7,video,2018-04-17 07:42:00,295,453,53,260,32,1,1,0,1,7,1,Tuesday,Medium
8,photo,2018-04-17 03:33:00,203,1,0,198,5,0,0,0,0,3,1,Tuesday,Medium
9,photo,2018-04-11 04:53:00,170,9,1,167,3,0,0,0,0,4,2,Wednesday,Medium


In [3]:
# Organizing data: remove, in a single call, every column that is not useful
# for the analysis.
df = df.drop(
    columns=[
        'status_published',
        'num_shares',
        'num_comments',
        'num_likes',
        'num_loves',
        'num_wows',
        'num_hahas',
        'num_sads',
        'num_angrys',
        'dayofweek_published',
    ]
)

df.shape

(6996, 5)

In [4]:
# Take a random subset of rows so the exported file is small enough for Weka.
# The seed is fixed so that a fresh Restart & Run All selects the same rows and
# therefore reproduces ThaiDemo.csv and every output computed from the sample.
SAMPLE_SIZE = 401
RANDOM_SEED = 42

# min() keeps the whole frame when it has fewer rows than SAMPLE_SIZE, matching
# the original slice-based behaviour instead of raising.
df = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=RANDOM_SEED)

# Save the selected rows (without the index) into a new CSV file.
df.to_csv('ThaiDemo.csv', index=False, header=True)

In [5]:
# Display the current contents of df as this cell's output.
df

Unnamed: 0,status_type,num_reactions,hour_published,day_of_week_published,level_of_reactions
111,photo,203,3,Monday,Medium
1596,photo,58,5,Tuesday,Low
938,photo,320,5,Monday,Medium
5401,video,54,7,Sunday,Low
6422,video,23,7,Monday,Low
931,photo,151,10,Sunday,Medium
2373,photo,11,6,Monday,Low
4317,video,0,8,Friday,Low
729,photo,96,12,Monday,Low
2151,photo,24,10,Wednesday,Low


In [6]:
# Pairwise correlation of the numeric attributes. df also holds text columns,
# which pandas 2.0+ refuses to correlate (it raises instead of silently
# skipping them), so the numeric columns are selected explicitly first.
df.select_dtypes(include='number').corr()

Unnamed: 0,num_reactions,hour_published
num_reactions,1.0,0.028234
hour_published,0.028234,1.0


In [7]:
# Data quality report: one row per column of df, one column per statistic.
# Each statistic is computed for all columns in a single vectorized call rather
# than by growing a DataFrame one row at a time inside a loop.

# Column dtypes; defined for reference but not part of the displayed report.
data_types = df.dtypes.to_frame('Data Type')

present_data_counts = df.count().to_frame('Present Values')
missing_data_counts = df.isnull().sum().to_frame('Missing Values')
unique_value_counts = df.nunique().to_frame('Unique Values')

# min/max are evaluated column by column so that text columns (compared
# alphabetically) and numeric columns can share one report.
minimum_values = df.apply(lambda column: column.min()).to_frame('Minimum Values')
maximum_values = df.apply(lambda column: column.max()).to_frame('Maximum Values')

data_quality_report = pd.concat(
    [
        present_data_counts,
        missing_data_counts,
        unique_value_counts,
        minimum_values,
        maximum_values,
    ],
    axis=1,
)
data_quality_report

Unnamed: 0,Present Values,Missing Values,Unique Values,Minimum Values,Maximum Values
status_type,401,0,4,link,video
num_reactions,401,0,217,0,2636
hour_published,401,0,22,0,23
day_of_week_published,401,0,7,Friday,Wednesday
level_of_reactions,401,0,3,High,Medium
