# In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from pandas.api.types import is_string_dtype, is_numeric_dtype

# Load the dataset and preview the first five rows.
df = pd.read_csv('/Independence100.csv')
print(df.head(5))

### 1. Know Your Data ###
# info() writes its report to stdout itself.
df.info()
# describe() only returns a DataFrame; outside an interactive cell the result
# is silently discarded unless it is printed explicitly.
print(df.describe())

# Missing-value summary: per-column count and percentage of null cells.
missing_count = df.isna().sum()    # nulls per column
value_count = df.isna().count()    # total cells per column (row count)
missing_percentage = (missing_count / value_count * 100).round(2)
missing_df = pd.DataFrame(
    {
        'count': missing_count,
        'percentage': missing_percentage,
    }
)
print(missing_df)

# Bar chart of the missing percentage, with each bar labelled by its value.
barchart = missing_df.plot(kind='bar', y='percentage')
for position, pct in enumerate(missing_percentage):
    barchart.text(position, pct, f"{pct}%")


### 2. Feature Engineering ###

# Length of each title. The vectorised .str.len() yields NaN for a missing
# title, whereas apply(len) raises TypeError on a NaN (float) entry.
df['title_length'] = df['title'].str.len()

# Month number extracted from the date column, stored as a string so it is
# treated as a categorical attribute further down.
df['month'] = pd.to_datetime(df['date']).dt.month.apply(str)

# Whether the article has a subtitle: 'Yes' when a subtitle is present.
# (The previous isnull() condition produced the opposite labels.)
df['with_subtitle'] = np.where(df['subtitle'].notnull(), 'Yes', 'No')

# Drop the raw columns that are not analysed directly.
df = df.drop(
    ['id', 'subtitle', 'title', 'url', 'date', 'image', 'responses'],
    axis=1,
)

# Split the column names into numeric and categorical (string) attributes.
# A column is tested for string dtype only if it is not numeric, so no column
# can land in both lists.
num_list = [name for name in df if is_numeric_dtype(df[name])]
cat_list = [
    name
    for name in df
    if not is_numeric_dtype(df[name]) and is_string_dtype(df[name])
]

print(num_list)
print(cat_list)


### 3. Univariate Analysis ###

# One figure per column: a histogram for numeric columns, a bar chart of the
# value counts for string columns.
for column in df:
    # The column name doubles as the figure label and as the title.
    plt.figure(column, figsize=(4.9, 4.9))
    plt.title(column)
    if is_numeric_dtype(df[column]):
        df[column].plot(kind='hist')
    elif is_string_dtype(df[column]):
        # show only the TOP 10 value counts in each categorical column
        df[column].value_counts().iloc[:10].plot(kind='bar')


### 4. Multivariate Analysis ###

# correlation matrix and heatmap
# Restrict to the numeric columns: DataFrame.corr() raises on string columns
# in pandas >= 2.0, where numeric_only defaults to False.
correlation = df[num_list].corr()
# Open a fresh figure so the heatmap is not drawn onto whichever axes happen
# to be current from earlier plotting.
plt.figure()
sns.heatmap(correlation, cmap="GnBu", annot=True)

# pairplot
sns.pairplot(df, height=2.5)

# grouped bar chart
# For every ordered pair of distinct categorical columns, count rows per
# primary category, split (hue) by the secondary category.
for primary_cat in cat_list:
    for secondary_cat in cat_list:
        if secondary_cat != primary_cat:
            plt.figure(figsize=(15, 15))
            chart = sns.countplot(
                data=df,
                x=primary_cat,
                hue=secondary_cat,
                palette='GnBu',
                # show only the TOP 10 most frequent primary categories
                order=df[primary_cat].value_counts().iloc[:10].index,
            )

# pairplot with hue: one pairplot per categorical column, coloured by it
for hue_cat in cat_list:
    sns.pairplot(df, hue=hue_cat)

# box plot
# One figure per (categorical, numeric) pair: distribution of the numeric
# column within each category.
for cat in cat_list:
    for num in num_list:
        plt.figure(figsize=(15, 15))
        sns.boxplot(x=cat, y=num, data=df, palette="GnBu")