In [None]:
#import required libraries for the data analysis
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Read the dataset and take a first look at it.
# A notebook cell only renders its LAST expression automatically, so every
# intermediate result is handed to display() (available as a builtin inside
# IPython/Jupyter kernels); a bare expression here would be computed and
# silently discarded.
df = pd.read_csv('example.csv')
display(df.head())          # first 5 rows
df.info()                   # prints basic information about the dataset itself
display(df.describe())      # statistical information about numerical variables
display(df.shape)           # (row count, column count)
display(df.isnull().sum())  # number of null values per column
# index of the rows whose value is missing in the specified column
display(df.loc[df['variable'].isna(), :].index)

# Percentage of missing data in each column.
# Uses `df` (the frame loaded above) -- the previous `pharma_data` name is not
# defined anywhere in this notebook and raised a NameError.
# The mean of a boolean mask is the fraction of True values.
display(df.isnull().mean() * 100)

# Check for duplicates: number of rows that repeat an earlier row.
# display() is needed because this is not the last expression of the cell.
display(df.duplicated().sum())

# Display duplicate rows; keep=False marks every member of a duplicate group,
# including its first occurrence.
display(df[df.duplicated(keep=False)])

# Drop duplicate records. The result is rebound to `df` instead of using
# inplace=True, which is the form pandas recommends and keeps the step
# chainable.
df = df.drop_duplicates()


# Count of unique values in each column of the dataset.
# display() is needed because these are not the last expression of the cell.
display(df.nunique())

# Array of the unique values in one column.
# Uses `df`; the previous `pharma_data` name is not defined in this notebook.
display(df['variable'].unique())

# Count of each unique value in the column.
display(df['variable'].value_counts())

# This helps us in profiling the dataset
# Profiling helps you understand data better!
!pip install pandas-profiling
import pandas_profiling 
profile = pandas_profiling.ProfileReport(df)
profile.to_file("profile_report.html") #exporting to html file for a better view 

# Replace missing values of a categorical variable with its mode.
# mode() returns a Series object, hence .iloc[0] to take its first entry.
# ref: https://stackoverflow.com/questions/42789324/pandas-fillna-mode
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on the df['variable'] selection can operate on a temporary copy and leave
# `df` itself unchanged (pandas warns about this chained pattern).
df['variable'] = df['variable'].fillna(df['variable'].mode().iloc[0])

# Fill the remaining missing values with the column median.
# numeric_only=True restricts the median to numerical columns; without it
# recent pandas versions raise a TypeError as soon as a text column is present.
df = df.fillna(df.median(numeric_only=True))

# remove a single column from the dataset (modifies df in place)
df.drop(columns=['variable'], inplace=True)

# turn the categorical variables into indicator (dummy) columns
df = pd.get_dummies(data=df)

# If a categorical variable has multiple classes and you want to convert it
# into a single numerical column:
# 1) convert the column to the category dtype
df['categorical_variable'] = df['categorical_variable'].astype('category')
# 2) use '.cat.codes' to create a new column holding the encoded values.
#    Reads from `df`; the previous `pharma_data` name is not defined in this
#    notebook and raised a NameError.
df['new_column'] = df['categorical_variable'].cat.codes

# Consider every variable except the target as an input variable.
# More column names can be added to the list to drop unnecessary columns too.
X = df.drop(columns=['targetVariable'])
# Target variable -- same column name as the one dropped above (the previous
# spelling 'targetVarible' did not match and raised a KeyError).
y = df['targetVariable']

# Split the dataset into train and test sets: test_size=0.2 holds out 20% of
# the rows for testing, random_state fixes the shuffle so the split is
# reproducible.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)



# visualizations

In [None]:
# visualise the null values: heatmap of the boolean df.isnull() mask
fig, ax = plt.subplots(figsize=(14, 7))
sns.heatmap(data=df.isnull(), cmap='viridis', cbar=False, yticklabels=False, ax=ax)

# bar graph of the classes of a variable and how often each one occurs
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x='any_Variable', palette='Accent', ax=ax)

# relation between the dependent variable and an independent variable:
# counts of the target classes, split by the independent variable
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x='target', hue='independent_variable', palette='Accent', ax=ax)

# boxplot of an independent variable for each class of the target
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='target', y='independent_variable', palette='Accent', ax=ax)

# Correlation heatmap, annotated with the coefficient of every pair of columns.
# The frame is restricted to numerical/boolean columns first: recent pandas
# versions no longer skip text columns silently in corr() and raise instead.
plt.figure(figsize=(12, 7))
sns.heatmap(df.select_dtypes(include=['number', 'bool']).corr(), annot=True)



# lazy predict

In [None]:
#lazypredict - gives accuracy of different models 
!pip install catboost
!pip install xgboost
!pip install lightgbm
!pip install lazypredict
!pip install pytest
import lazypredict
import lightgbm
import xgboost
import catboost
import pytest

# Classification models: fit the LazyClassifier model zoo on the breast
# cancer dataset and compare the results.
from lazypredict.Supervised import LazyClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
data = load_breast_cancer()
X = data.data
y = data.target
# half of the samples are held out for testing; random_state fixes the split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5, random_state=123)
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)
# Not the last expression of the cell, so a bare `models` would never be
# rendered; display() shows the comparison table explicitly.
display(models)

# Regression models: fit the LazyRegressor model zoo and compare the results.
# The example uses the diabetes dataset bundled with scikit-learn, because
# load_boston has been removed from scikit-learn (version 1.2) and the
# previous datasets.load_boston() call fails on current installations.
from lazypredict.Supervised import LazyRegressor
from sklearn import datasets
from sklearn.utils import shuffle
import numpy as np
diabetes = datasets.load_diabetes()
X, y = shuffle(diabetes.data, diabetes.target, random_state=13)
X = X.astype(np.float32)
# first 90% of the shuffled samples for training, the remaining 10% for testing
offset = int(X.shape[0] * 0.9)
X_train, y_train = X[:offset], y[:offset]
X_test, y_test = X[offset:], y[offset:]
reg = LazyRegressor(verbose=0, ignore_warnings=False, custom_metric=None)
models, predictions = reg.fit(X_train, X_test, y_train, y_test)
models