# Mina's Toolkit

# imports

In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import datascience
import seaborn as sns 
import scipy.stats as stats
import sklearn as sk
from ast import literal_eval
%matplotlib inline

#suppress warnings
import warnings 
warnings.filterwarnings('ignore')


# Basic Dataframe operations 

## Mounting Google Drive

In [None]:
# Mount Google Drive at /content/drive (only works inside a Google Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

## Loading a dataset from a csv

In [None]:
# Load a comma-separated file into a DataFrame ('path_to_df' is a placeholder).
df = pd.read_csv('path_to_df', sep=',')

# Pass sep='\t' instead to read tab-separated files.
# NOTE: the call below rebinds `df`, replacing the frame loaded above.

df = pd.read_csv('path', sep='\t')



## Shape, describe, info

In [None]:
# In a notebook only the last expression of a cell is displayed;
# run these one at a time to see each result.
X = 5
df.head(X) # first X rows of the dataframe
df.tail(X) # last X rows of the dataframe
df.sample(X) # X rows drawn at random

In [None]:
df.shape  # (number of rows, number of columns)
df.describe()  # summary statistics per column
df.info()  # prints dtypes and non-null counts per column

Append `.style.background_gradient(...)` to `describe()` to render the summary table with a colour gradient, which makes it easier to read.

In [None]:
# Render the describe() table with a green colour gradient over its cells.
df.describe().style.background_gradient(cmap='Greens')
# cmap accepts other colormap names as well; see the matplotlib colormap reference.

## Set a new index 

In [None]:
# Use the 'newIndex' column as the row index, modifying df in place.
df.set_index('newIndex', inplace=True)

## Duplicates

check for duplicates by a specific column

In [None]:
# Count the values in 'colName' that repeat an earlier value.
df['colName'].duplicated().sum()

check for duplicates by a subset of columns

In [None]:
# Count rows that repeat an earlier row on the given columns.
# Column names must be passed as strings.
df.duplicated(subset=['colName1', 'colName2']).sum()

#or

cols = df.columns.tolist() # any subset works; all columns are checked here
df[cols].duplicated().sum()



### Dropping Duplicates


In [None]:
# Keep the first occurrence of each duplicated (colName1, colName2) pair and
# drop the rest in place. Column names must be strings; list as many as needed.
df.drop_duplicates(subset=['colName1', 'colName2'], keep='first', inplace=True)

## Value counts 

In [None]:
# Frequency of each distinct value in the column (missing values are excluded).
df['colName'].value_counts()

# Same, but missing values get their own count.
df['colName'].value_counts(dropna=False)

## Normalized Value counts

In [None]:
# Relative frequencies (fractions of the total) instead of raw counts.
df['colName'].value_counts(normalize=True)

## getting numerical or categorical columns

In [None]:
# Names of the columns stored as int64 or float64 (other numeric widths are not matched).
numerical_Cols = df.select_dtypes(include=['int64', 'float64']).columns.to_list()

# Names of the columns with object dtype.
categoricalCols = df.select_dtypes(include=['object']).columns.to_list()

## getting a list of columns in data

In [None]:
# All column names of df as a plain Python list.
colsList = df.columns.tolist()

## Check for missing values in a Dataframe

In [None]:
# Number of missing values per column, largest first.
df.isnull().sum().sort_values(ascending=False)

## Fill missing values

In [None]:
# Any scalar works as the fill value: .mean(), .median(), .mode()[0], or a literal.
# (.mode() returns a Series, so take its first element.)


# Replace missing values in the existing column. The result is assigned back
# because fillna(inplace=True) on df['colName'] acts on a selected Series and
# is not guaranteed to update df itself.
df['colName'] = df['colName'].fillna(df['colName'].mean())

#or in a new column

df['colNameNoMissing'] = df['colName'].fillna(df['colName'].mean())

## Dropping Missing values

In [None]:
# Drop every row that has at least one missing value.
df.dropna(inplace=True)

# Drop only the rows where a specific column is missing.
# (Calling dropna on df['colName'] would leave df unchanged.)
df.dropna(subset=['colName'], inplace=True)

## Dropping columns from a dataframe

In [None]:
# Build a new frame without the column; df itself is left untouched.
newDf = df.drop(columns=['colName'])

#or

# Remove several columns from df in place.
dropList = ['colName1', 'colName2']
df.drop(columns=dropList, inplace=True)

## Drop row by its id

In [None]:
# Remove the row whose index label is 1299, modifying df in place.
df.drop(index=1299, inplace=True)


## Iterate over a dataframe

In [None]:
# Visit the frame row by row; each `row` is a Series keyed by column name.
for index, row in df.iterrows():
    #access row by column name
    if row['colName'] == 'value':
        #do something
        pass

In [None]:
# Visit the frame column by column as (column name, column Series) pairs.
# DataFrame.items() is used because iteritems() was removed in pandas 2.0.
for colName, col in df.items():
    #access column by column name
    if colName == 'colName':
        #do something
        pass

In [None]:
# Row-by-row replacement of missing 'colName' values with a fixed value.
for (index, row) in df.iterrows():
     if pd.isnull(row.loc['colName']): 
        # write through df.loc so the change lands in df, not in the row copy
        df.loc[index, 'colName'] = 'replacemenetValue'

## Accessing a specific row 

In [None]:
# Overwrite 'colName' with valuetoSet in every row where it currently equals compareValue.
valuetoSet = 0
compareValue = 5
df.loc[df['colName'] == compareValue, 'colName'] = valuetoSet


## Selecting rows by logical filters

In [None]:
# Keep the rows where 'colName' equals 'value'.
selectedDf = df[df['colName'] == 'value']

#combine two filters 

# Each comparison needs its own parentheses: & binds tighter than ==,
# so without them Python evaluates 'value' & df['colName2'] first.
selectedDf = df[(df['colName'] == 'value') & (df['colName2'] == 'value2')]

## Lambda functions on rows (better than loops)

In [None]:
# Apply a lambda element-wise to one column and store the result in a new column.

df['LambdacolName'] = df['colName'].apply(lambda x: x + 1)

# A named function can be used in place of the lambda (see the last line of this cell).

def function(X):
    return X + 1


# Column-wise apply: square the column named 'z', pass every other column through unchanged.
modDfObj = df.apply(lambda x: np.square(x) if x.name == 'z' else x)

# Row-wise apply (axis=1): call function() on each row's 'colName' value.
df["LambdacolName"] = df.apply(lambda x: function(x['colName']), axis=1)



# Visualization

## Regression plots


In [None]:
# Scatter of col1 vs col2 (black points) with a fitted regression line (red).
sns.regplot(x = 'col1', y = 'col2', data = df,scatter_kws={"color": "black"}, line_kws={"color": "red"})

## Histograms

In [None]:
# Histogram of a single column using pandas' built-in plotting.
df['colName'].hist()

## Count plots

In [None]:
# Bar chart of category counts in 'colName', split by the 'diff' column.
# The column name must be a string when the frame is passed via data=.
sns.countplot(x='colName', data=df, hue='diff')

## Annotating barplots

In [None]:
# For every column, draw two side-by-side bar charts of its normalised value
# counts: left for rows where 'colName' is 'Yes', right for 'No'. Each bar is
# labelled with its height.
for col in colsList:
  fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10,6))  # one row, two panels
  for axis, answer in ((ax1, 'Yes'), (ax2, 'No')):
    shares = df[df['colName'] == answer][col].value_counts(normalize=True)
    shares.plot(kind='bar', ax=axis)
    for p in axis.patches:
      label_at = (p.get_x() * 1.005, p.get_height() * 1.005)
      axis.annotate(str(p.get_height()), label_at)
  plt.tight_layout()
  plt.show()

## SNS barplots


In [None]:
# Bar plot of yColName per xColName category, coloured with the 'rainbow' palette.
sns.barplot(x="xColName", y="yColName", data=df,palette="rainbow")


## Sorted barplots

In [None]:
plt.figure(figsize=(20, 15))
# Average SalePrice per Neighborhood, highest first, so the bars come out sorted.
# The column is selected before averaging so that non-numeric columns in df
# never take part in the mean.
perNei = df.groupby('Neighborhood')['SalePrice'].mean().sort_values(ascending=False)

sns.barplot(x=perNei.index, y=perNei)

## SNS pairplots

In [None]:
# One scatter panel per column in colsList, each plotted against 'Price'.
sns.pairplot(df, x_vars=colsList, y_vars=['Price'])
  

## Box plots

In [None]:
# Box plot of yColName for each category of colName.
plt.figure(figsize=(15, 10))
sns.boxplot(y='yColName', x='colName', data=df)
plt.show()

In [None]:
# Box plot of a single column using pandas' built-in plotting.
df["colName"].plot.box(figsize=(8,8),color = 'blue')


## QQ plots

In [None]:
import scipy.stats as stats
from statsmodels.graphics.gofplots import qqplot

def qq_plots(df):
    """Draw a normal Q-Q plot of `df` with a standardized reference line.

    `df` is passed straight to statsmodels' qqplot, so it should be a
    one-dimensional sample (e.g. a single DataFrame column).
    """
    # Create the axes ourselves and hand them to qqplot; otherwise qqplot
    # opens its own figure and a separately created one stays empty.
    fig, ax = plt.subplots(figsize=(10, 4))
    qqplot(df, line='s', ax=ax)
    ax.set_title("Normal QQPlot")
    plt.show()

## Dist plots

In [None]:
# histplot replaces the deprecated sns.distplot. kde=True with stat='density'
# reproduces distplot's default output: a normalised histogram plus a KDE curve.
sns.histplot(df['colName'], kde=True, stat='density')

#optional: 

# Plain count histogram with a fixed number of bins and no KDE curve.
sns.histplot(df['colName'], bins=20)

## Nice distribution plots for a group of columns

In [None]:
def dist_custom(dataset, columns_list, rows, cols, suptitle, size=(16,20), y=0.92):
    """Plot a grid of histograms, one per column, marking mean and median.

    Parameters
    ----------
    dataset : DataFrame holding the columns to plot.
    columns_list : names of the columns to plot (at most rows * cols of them).
    rows, cols : shape of the subplot grid.
    suptitle : title drawn above the whole figure.
    size : figure size handed to plt.subplots.
    y : vertical position of the figure title.
    """
    # squeeze=False keeps axs two-dimensional even for a 1x1 or 1xN grid,
    # so flatten() always works.
    fig, axs = plt.subplots(rows, cols, figsize=size, squeeze=False)
    fig.suptitle(suptitle, y=y, size=16)
    axs = axs.flatten()
    for i, data in enumerate(columns_list):
        series = dataset[data]
        mean, median = series.mean(), series.median()
        graph = sns.histplot(series, ax=axs[i])
        graph.axvline(mean, c='red', label='mean')
        graph.axvline(median, c='green', label='median')
        # Attach the legend to this subplot. plt.legend() would act on the
        # "current" axes, which is not the one being drawn here.
        graph.legend()
        axs[i].set_title(data + ', skew: ' + str(round(series.skew(axis=0, skipna=True), 2)))
    # Hide grid cells that did not receive a column.
    for unused in axs[len(columns_list):]:
        unused.set_visible(False)
        
# Example: histograms of the first nine numeric columns on a 3x3 grid.
dist_custom(dataset=df, columns_list=numerical_Cols[:9], 
            rows=3, cols=3, suptitle='Distibution for each variable', size=(16,15), y=1.0)

## Logistic regression plots

In [None]:
# Scatter with a fitted logistic curve; y_jitter spreads the plotted points
# vertically so overlapping observations stay visible.
sns.lmplot(x="xColName", y="yColName", data=df, logistic=True, y_jitter=.03)
plt.show()

## Scatter plots

In [None]:
# Basic scatter plot of two columns.
sns.scatterplot(x='xColName', y='yColName', data=df)

### Scatter plots with Hues

In [None]:
# Scatter plot with points coloured by a third column.
sns.scatterplot(x=df['ColName'], y=df['colName2'], hue=df['HueColName'],palette="Set2")
plt.show()

## 3D visualization (for clusters, etc)

In [None]:
def PlotCluster3D(dataframe, col):
    """Show a 3-D scatter of the PCA1/PCA2/PCA3 columns, coloured by `col`.

    `dataframe` must contain the columns 'PCA1', 'PCA2', 'PCA3' and `col`.
    """
    plt.figure(figsize=(10,8))
    axes3d = plt.subplot(111, projection='3d', label="bla")
    coords = [dataframe[name] for name in ('PCA1', 'PCA2', 'PCA3')]
    axes3d.scatter(*coords, s=40, c=dataframe[col], marker='o', cmap='viridis')
    axes3d.set_title("The Plot Of The Clusters {}".format(col))
    plt.show()

In [None]:
# Example: colour the 3-D PCA scatter by the labels stored in 'KMeansColName'.
PlotCluster3D(dataframe=df, col='KMeansColName')

# Skewness

## Skewness calculation

In [None]:
# Skewness and kurtosis of one column, returned together.
df['colName'].agg(['skew', 'kurtosis']).transpose()

## Box-Cox transformation for skewed columns

In [None]:
from scipy.stats import boxcox

# Box-Cox Transformation in Python
# boxcox() returns (transformed values, fitted lambda); keep only the values.
# The source column is addressed by name so it matches the new column's name,
# and plain assignment (unlike df.insert) can be re-run without raising.
# NOTE: Box-Cox requires strictly positive input.
df['colName_BoxCox'] = boxcox(df['colName'])[0]

# Correlation analysis 

In [None]:
# Pairwise correlation table, coloured from cool (low) to warm (high).
# NOTE(review): on pandas >= 2.0 corr() raises if df holds non-numeric columns;
# select numeric columns first in that case.
df.corr().style.background_gradient(cmap='coolwarm')

## Heatmap


In [None]:
# Heatmap of the correlation matrix with each coefficient written in its cell.
plt.figure(figsize=(20,20))
sns.heatmap(df.corr(), annot=True, cmap='YlGnBu')

## Point-biserial correlation


In [None]:
from scipy import stats
# Point-biserial correlation between two columns; returns the coefficient and its p-value.
# presumably one column is binary and the other continuous — confirm for your data
stats.pointbiserialr(df['colName1'], df['colName2'])


## Highly correlated variables

In [None]:
# List columns whose absolute correlation with an earlier column exceeds 0.70.
corr_matrix = df.corr()  # computed once and reused
# Keep only the strict upper triangle so each pair is examined once.
# The builtin bool is used because the np.bool alias was removed in NumPy 1.24.
upper_mask = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
upper_tri = corr_matrix.where(upper_mask)
to_drop = [column for column in upper_tri.columns if (upper_tri[column].abs() > 0.70).any()]
print(to_drop)

## sorted absolute correlation columns with target

In [None]:
# Correlation of every column with 'Target', ordered by absolute value (weakest first).
x = df.corr()
x = x['Target']
# sort on |r| but keep the signed coefficients in the result
sortedCorrelation = x.reindex(x.abs().sort_values().index)
sortedCorrelation

## Two correlation heat maps next to each other

In [None]:
# Two correlation heatmaps side by side: rows with is_legendary == 0 on the
# left, is_legendary == 1 on the right.
print("To the left, non-legendary")
fig, (ax1, ax2) = plt.subplots(1,2, figsize=(15,5))
attrib = ['hp','sp_attack','sp_defense','attack','defense','speed', 'base_happiness', 'capture_rate']
sns.heatmap((df[df['is_legendary']==0][attrib]).corr(), annot=True, ax=ax1, cmap='coolwarm')
sns.heatmap((df[df['is_legendary']==1][attrib]).corr(), annot=True, ax=ax2, cmap='coolwarm')
# No further plt.figure() call here: it would open a second, empty figure.
plt.show()


# Median values heatmap

In [None]:
# Median of every 'against*' column per type1, drawn as a heatmap.
against = df.columns[df.columns.str.contains('against')]
# Select the columns before aggregating so non-numeric columns in df never
# take part in the median.
speciality = df.groupby('type1')[list(against)].median()
plt.figure(figsize=(20,8))
sns.heatmap(speciality, annot=True, cmap='coolwarm')

## Variance threshold

`VarianceThreshold` is designed to drop columns with very low variance (constant and quasi-constant features).

In [None]:
from sklearn.feature_selection import VarianceThreshold

# Flag features whose variance does not exceed 0.10.
var_thr = VarianceThreshold(threshold = 0.10) #Removing both constant and quasi-constant
var_thr.fit(df)

# Boolean mask over df's columns: True for the columns that are kept.
var_thr.get_support()

In [None]:
# Columns rejected by the fitted VarianceThreshold (their support flag is False).
kept = set(df.columns[var_thr.get_support()])
concol = [column for column in df.columns if column not in kept]

for features in concol:
    print(features)

## get variance of all columns

In [None]:
# Print the variance of every int64/float64 column.
numerical_Cols = df.select_dtypes(include=['int64', 'float64']).columns
for col in numerical_Cols:
    print(col, ": ", df[col].var())

# Outliers analysis

## Hampel filter

In [None]:
def hampel(dfr, col, threshold):
    """Return a copy of `dfr` without the Hampel outliers of column `col`.

    A value is kept when it lies strictly inside
    median +/- threshold * 1.4826 * MAD, where MAD is the median absolute
    deviation from the median. Rows whose `col` is missing are kept too.
    The original index and row order are preserved, and `dfr` is not modified.

    Parameters
    ----------
    dfr : DataFrame to filter.
    col : name of the numeric column to test.
    threshold : width of the accepted band, in scaled-MAD units.
    """
    values = dfr[col]
    med = values.median()
    print("Median: ", med)
    mad = np.nanmedian(np.abs(values - med))
    print("MAD: ", mad)
    half_width = threshold * 1.4826 * mad
    low, high = med - half_width, med + half_width
    print("Low range: ", low)
    print("High range: ", high)
    # One boolean mask replaces the filter-then-outer-merge approach, which
    # reset the index and reordered the rows.
    keep = values.isnull() | ((values > low) & (values < high))
    dfc = dfr[keep].copy()
    print("Number of outliers: ", dfr.shape[0] - dfc.shape[0])
    return dfc


## IQR 

In [None]:
##IQR outliers 

def IQR_Outliers(dataframe, column, threshold):
    """Return the rows of `dataframe` whose `column` value is an IQR outlier.

    The fences are Q1 - threshold * IQR and Q3 + threshold * IQR; rows strictly
    below the lower fence or strictly above the upper fence are returned.
    Both fences are printed.
    """
    values = dataframe[column]
    Q1, Q3 = values.quantile(0.25), values.quantile(0.75)
    margin = (Q3 - Q1) * threshold
    lower_bound = Q1 - margin
    upper_bound = Q3 + margin
    print("Lower bound ", lower_bound, "Upper bound: ", upper_bound)
    too_low = values.lt(lower_bound)
    too_high = values.gt(upper_bound)
    return dataframe[too_low | too_high]

# Clustering

## Determining Number of clusters

### Elbow method

In [None]:
from sklearn.cluster import KMeans

# Fit KMeans for 1..9 clusters and plot the inertia of each fit; the "elbow"
# of the curve suggests a reasonable number of clusters.
cluster_counts = range(1, 10)
dist = []
for i in cluster_counts:
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=48484).fit(df)
    dist.append(kmeans.inertia_)

plt.plot(cluster_counts, dist, color="red", marker='*')
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('Distance')
plt.show()

### The Silhouette method

In [None]:
from sklearn.metrics import silhouette_score 

def silhouette_scorer(dataframe, lowRange, highRange, title):
    """Plot and return the silhouette score of KMeans for a range of cluster counts.

    Parameters
    ----------
    dataframe : data to cluster.
    lowRange, highRange : cluster counts lowRange .. highRange - 1 are tried.
    title : text used as the x-axis label.

    Returns
    -------
    list of silhouette scores, one per cluster count tried, in order.
    """
    cluster_counts = range(lowRange, highRange)
    silhouette_scores = []
    for k in cluster_counts:
        # Fit once and reuse the labels for scoring.
        labels = KMeans(n_clusters=k, random_state=42).fit_predict(dataframe)
        silhouette_scores.append(silhouette_score(dataframe, labels))
    # The x positions follow the requested range, so the bars always line up
    # with the scores.
    plt.bar(cluster_counts, silhouette_scores)
    plt.xlabel(title, fontsize = 20)
    plt.ylabel('S(i)', fontsize = 20)
    plt.show()
    return silhouette_scores

# Example: silhouette scores for 2..9 clusters on df.
scoresScaled = silhouette_scorer(df, 2, 10, "scaled dataframe")
print("Scores Scaled: ", scoresScaled)


## Clustering algorithms

In [None]:
from sklearn.cluster import MeanShift, KMeans, AffinityPropagation, SpectralClustering, AgglomerativeClustering, DBSCAN

### Kmeans

In [None]:
def kmeansDF(nClusters, dataframe):
    """Fit KMeans with `nClusters` clusters and return the cluster label of each row."""
    model = KMeans(n_clusters=nClusters, random_state=42)
    return model.fit_predict(dataframe)

# Store the 4-cluster KMeans labels as a new column.
# NOTE: the labels become part of df, so later fits on df will see this column too.
df['Kmeans'] = kmeansDF(4, df)


### Mean-Shift

In [None]:
from sklearn.cluster import estimate_bandwidth
def MeanShiftCluster(dataframe):
    """Cluster `dataframe` with MeanShift and return the label of each row.

    The bandwidth is estimated from the data (quantile 0.2, 500 samples).
    """
    width = estimate_bandwidth(dataframe, quantile=0.2, n_samples=500)
    model = MeanShift(bandwidth=width, bin_seeding=True)
    return model.fit(dataframe).labels_

In [None]:
# Store the MeanShift labels as a new column (fitted on df as it stands at this point).
df['MeanShift'] = MeanShiftCluster(df)


### DBScan 

In [None]:
def DBScan(dataframe):
    """Cluster `dataframe` with DBSCAN (eps=2000, min_samples=30) and return the labels."""
    model = DBSCAN(eps=2000, min_samples=30)
    model.fit(dataframe)
    return model.labels_

In [None]:
# Store the DBSCAN labels as a new column (fitted on df as it stands at this point).
df['DBScan'] = DBScan(df)


### Affinity propagation 

In [None]:
def AffinityPropagationCluster(dataframe):
    """Cluster `dataframe` with AffinityPropagation and return the label of each row."""
    model = AffinityPropagation(damping=0.95, preference=-1.0)
    model.fit(dataframe)
    return model.labels_

In [None]:
# Store the AffinityPropagation labels as a new column (fitted on df as it stands at this point).
df['AffinityPropagation'] = AffinityPropagationCluster(df)


### Spectral clustering

In [None]:
# 4-cluster spectral clustering on a nearest-neighbours affinity graph; labels stored as a new column.
df['SpectralClustering'] = SpectralClustering(n_clusters=4, eigen_solver='arpack', affinity="nearest_neighbors").fit_predict(df)


### Agglomerative clustering

In [None]:
# 4-cluster agglomerative clustering with Ward linkage; labels stored as a new column.
df['AgglomerativeClustering'] = AgglomerativeClustering(n_clusters=4, linkage='ward').fit_predict(df)


# Classification

# Dimensionality reduction 

## PCA and number of components choice

In [None]:
from sklearn.decomposition import PCA
# A float n_components keeps as many components as needed to explain that
# share of the variance (99% here).
pca = PCA(n_components = 0.99)
pca.fit(df)
reduced = pca.transform(df)

plt.rcParams["figure.figsize"] = (12,6)

fig, ax = plt.subplots()
y = np.cumsum(pca.explained_variance_ratio_)
# One x position per retained component, so the plot works for any number
# of components PCA decides to keep.
xi = np.arange(1, len(y) + 1, step=1)

plt.ylim(0.0,1.1)
plt.plot(xi, y, marker='o', linestyle='--', color='b')

plt.xlabel('Number of Components')
plt.xticks(np.arange(0, len(y) + 2, step=1)) # ticks from 0 to one past the last component
plt.ylabel('Cumulative variance (%)')
plt.title('The number of components needed to explain variance')

plt.axhline(y=0.85, color='r', linestyle='-')
plt.text(0.5, 0.85, '85% cut-off threshold', color = 'red', fontsize=16)

ax.grid(axis='x')
plt.show()

## PCA and rename new columns


In [None]:
# Name the components PCA1..PCAk, where k is however many components were
# kept, so the column count always matches `reduced`.
pca_columns = [f'PCA{i}' for i in range(1, reduced.shape[1] + 1)]
PCA_df = pd.DataFrame(reduced, index=df.index, columns=pca_columns)
PCA_df.head()

# Categorical encoding

In [None]:
from scipy.stats import chi2_contingency
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
def HandlingCategoricalFeatures(df):
    """Encode four categorical columns, test them pairwise for dependence, and plot each against Price.

    Steps:
    1. Ordinal-encode 'regionname', 'CouncilArea', 'Method' and 'Type' in
       place (codes start at 1). NOTE: this modifies the passed-in frame.
    2. For every ordered pair of those columns, print the contingency table
       and the chi-square p-value; when p <= 0.05 also print Cramér's V.
    3. Draw a 2x2 grid of box plots of 'Price' per encoded category.

    `df` must contain the four columns above plus 'Price'. Returns None.
    """
    cols = ['regionname', 'CouncilArea', 'Method', 'Type']

    # Make them numbers first (shifted by one so codes start at 1).
    le = OrdinalEncoder()
    df[cols] = le.fit_transform(df[cols]) + 1

    alpha = 0.05
    for idx in cols:
        for idx2 in cols:
            if idx == idx2:
                continue
            CrosstabResult = pd.crosstab(index=df[idx], columns=df[idx2])
            print(CrosstabResult)
            stat, p, dof, expected = chi2_contingency(CrosstabResult)
            # interpret p-value
            print("p value is " + str(p))
            if p <= alpha:
                print('Dependent (reject H0)')
                # Strength of association: Cramér's V, computed from the
                # same contingency table as the test above.
                chisq_stat = chi2_contingency(CrosstabResult, correction=False)[0]
                # sample size = number of observations in the table
                n = CrosstabResult.to_numpy().sum()
                # min(rows, columns) of the table, minus one
                minshape = min(CrosstabResult.shape) - 1
                V_ = np.sqrt((chisq_stat / n) / minshape)
                print(f"Cramer' V: {V_}")
            else:
                print('Independent (H0 holds true)')

    # Box plot of Price per category, one panel per column, filled column by
    # column: (0,0), (1,0), (0,1), (1,1).
    fig, axs = plt.subplots(2, 2, figsize=(15, 10))
    for position, name in enumerate(cols):
        sns.boxplot(ax=axs[position % 2, position // 2], x=name, y='Price', data=df)

# Feature engineering

## changing categorical column of two values to binary


In [None]:
# 1 where Gender equals 'Male', 0 otherwise.
df['_Gender'] = df['Gender'].eq('Male').mul(1)


## Datetime conversion 

In [None]:
# Parse the string column into datetime values stored in a new column.
df['Date'] = pd.to_datetime(df['Dt_String'])

## Ordinal encoding or categorical variables replacement

In [None]:
# Collapse the education levels into three coarser groups.
replaceEDict = {'Graduation': "Grad", 'PhD': "PostGrad", 'Master': "PostGrad", 'Basic':"UnderGrad", '2n Cycle': "PostGrad"}

# Assign the result back: replace(inplace=True) on df['Education'] acts on a
# selected Series and is not guaranteed to update df itself.
df['Education'] = df['Education'].replace(replaceEDict)

In [None]:
# Ordinal codes for the utility levels, highest for 'AllPub'.
UtilDict = {'AllPub':4, 'NoSewr':3, 'NoSeWa':2, 'ELO':1, 'None':0}
# Values missing from UtilDict become NaN after map().
df['_Utilities'] = df['Utilities'].map(UtilDict)

## Scaling and Standardization

### Standard scaler

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise every column of df; the result is stored in ScaledDF1.
scaler = StandardScaler()
ScaledDF1 = scaler.fit_transform(df)
ScaledDF1

### Min-Max Scaler

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Min-max scale every column of df; the result is stored in ScaledDF1.
scaler = MinMaxScaler()
ScaledDF1 = scaler.fit_transform(df)
ScaledDF1

## Label encoding

In [None]:
from sklearn.preprocessing import LabelEncoder
# Replace each distinct value of the column with an integer code, overwriting the column.
LabelEncoderM = LabelEncoder()
df['colName'] = LabelEncoderM.fit_transform(df['colName'])


## BaseN encoding (useful for grouping or encoding a large number of categorical values)

In [None]:
#I will not one-hot encode all of these, let's try binary encoder
import category_encoders as ce
import pandas as pd

#Create an object for Base N Encoding
encoder= ce.BaseNEncoder(cols=['MSSubClass'],return_df=True,base=3)

data_encoded=encoder.fit_transform(df)
data_encoded.head()

# Modeling 

## Train-test split

In [None]:
from sklearn.model_selection import train_test_split


Y = df['Target']
# Keep the target out of the feature matrix: colsList may hold every column
# of df, and leaving 'Target' in X would leak the answer to the model.
X = df[[name for name in colsList if name != 'Target']]
# Hold out 25% of the rows for testing; the seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=42)

## Logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Fit a logistic-regression classifier with scikit-learn's default settings.
logisticRegr = LogisticRegression()
logisticRegr.fit(x_train, y_train)

# score() reports the mean accuracy on the given features and labels.
print("Training score without scaling: ", logisticRegr.score(x_train, y_train))
print("Test score without scaling: ", logisticRegr.score(x_test, y_test))
print('\n\n')

## Naive-Bayes 


In [None]:
from sklearn.naive_bayes import GaussianNB

# Fit a Gaussian Naive Bayes classifier and report train/test accuracy.
GaussianClassification = GaussianNB()
GaussianClassification.fit(x_train, y_train)

print("Training score: ", GaussianClassification.score(x_train, y_train))
print("Testing score: ",GaussianClassification.score(x_test, y_test))

## SVM 

In [None]:
from sklearn.svm import SVC

# Pick one kernel; the alternatives are left commented out.
SVM = SVC(kernel='linear')
#SVM = SVC(kernel='poly')
#SVM = SVC(kernel='rbf')
#SVM = SVC(kernel='sigmoid')


SVM = SVM.fit(x_train, y_train)
predictions = SVM.predict(x_test)

# NOTE: the "after scaling" wording assumes x_train/x_test were scaled
# beforehand; nothing in this cell scales them.
print("Training score after scaling: ", SVM.score(x_train, y_train))
print("Testing score after scaling: ",SVM.score(x_test, y_test))

## Random Forests

In [None]:
from sklearn.ensemble import RandomForestClassifier
# oob_score=True makes the fitted forest expose oob_score_, which the
# out-of-bag report in the following cell reads.
rfm = RandomForestClassifier(max_depth=15, random_state=42, n_estimators=10, oob_score=True)

rfm = rfm.fit(x_train, y_train)
predictions = rfm.predict(x_test)
# NOTE: the "after scaling" wording assumes x_train/x_test were scaled beforehand.
print("Training score after scaling: ", rfm.score(x_train, y_train))
print("Testing score after scaling: ",rfm.score(x_test, y_test))

In [None]:
from sklearn.metrics import accuracy_score
predicted = rfm.predict(x_test)
accuracy = accuracy_score(y_test, predicted)
# oob_score_ only exists when the forest was created with oob_score=True;
# report that clearly instead of failing with AttributeError.
if hasattr(rfm, 'oob_score_'):
    print(f'Out-of-bag score estimate: {rfm.oob_score_:.3}')
else:
    print('Out-of-bag score unavailable: create the forest with oob_score=True')
print(f'Mean accuracy score: {accuracy:.3}')

## XGBoost

In [None]:
import xgboost as xgb
# Gradient-boosted tree classifier (despite the `xg_reg` name, this is a classifier).
xg_reg = xgb.XGBClassifier(learning_rate = 0.1,
                max_depth = 15, n_estimators = 20)

xg_reg = xg_reg.fit(x_train, y_train)
predictions = xg_reg.predict(x_test)
# NOTE: the "after scaling" wording assumes x_train/x_test were scaled beforehand.
print("Training score after scaling: ", xg_reg.score(x_train, y_train))
print("Testing score after scaling: ",xg_reg.score(x_test, y_test))


from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
# Cross-validated scores of a default XGBClassifier on the full X / Y.
cross_val_score(XGBClassifier(), X, Y)


### XGBoost feature importance

In [None]:
# Importance score of each feature in the fitted XGBoost model.
print(xg_reg.feature_importances_)

In [None]:
from xgboost import plot_importance
from matplotlib import pyplot
# Bar chart of the feature importances of the fitted XGBoost model.
plot_importance(xg_reg)
pyplot.show()

## AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost ensemble of 100 estimators with a fixed seed.
clf = AdaBoostClassifier(n_estimators=100, random_state=43)
clf.fit(x_train, y_train)

predictions = clf.predict(x_test)
# NOTE: the "after scaling" wording assumes x_train/x_test were scaled beforehand.
print("Training score after scaling: ", clf.score(x_train, y_train))
print("Testing score after scaling: ",clf.score(x_test, y_test))

## KNN classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours classifier that votes over the 15 closest training points.
neigh = KNeighborsClassifier(n_neighbors=15)
neigh.fit(x_train, y_train)

predictions = neigh.predict(x_test)
# NOTE: the "after scaling" wording assumes x_train/x_test were scaled beforehand.
print("Training score after scaling: ", neigh.score(x_train, y_train))
print("Testing score after scaling: ",neigh.score(x_test, y_test))

# Tricks


## Reduce memory usage 

In [None]:
def reduce_memory_usage(df, verbose=True):
    """Downcast numeric columns of `df` to the smallest dtype that holds their range.

    Integer columns are cast to the narrowest of int8/int16/int32/int64 whose
    range contains the column's min and max; other numeric columns are cast
    to float16, float32 or float64 on the same basis. Non-numeric columns are
    left alone.

    Parameters
    ----------
    df : DataFrame to shrink. It is modified in place and also returned.
    verbose : when true, print the final memory usage and the reduction.

    Returns
    -------
    The same DataFrame object, with downcast columns.
    """
    numerics = ["int8", "int16", "int32", "int64", "float16", "float32", "float64"]
    int_types = (np.int8, np.int16, np.int32, np.int64)
    float_types = (np.float16, np.float32)
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == "int":
            # Bounds are inclusive: a column whose extreme equals the type's
            # own min/max still fits that type.
            for candidate in int_types:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # NOTE: the range check says nothing about precision; float16 and
            # float32 store fewer significant digits than float64.
            for candidate in float_types:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max is NaN/inf: keep full width.
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        # Guard the percentage against a zero starting size.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(
            "Mem. usage decreased to {:.2f} Mb ({:.1f}% reduction)".format(
                end_mem, reduction
            )
        )
    return df

# Downcast df's numeric columns and print the memory saving.
df = reduce_memory_usage(df, verbose=True)