In [1]:
# Import pandas
import pandas as pd
# Import numpy
import numpy as np
# Import matplotlib
import matplotlib.pyplot as plt
# Import seaborn
import seaborn as sn
# Import stats
from scipy import stats
# Import ttest_ind
from scipy.stats import ttest_ind

In [3]:
# Load the raw training data. header=None tells pandas the file has no header
# row, so the columns are labelled with integers (0, 1, 2, ...).
# NOTE(review): absolute, machine-specific path — the notebook only runs where
# this exact location exists.
TRAIN_CSV = "/Users/suleymanaygun/Desktop/untitled folder/train.csv"
df = pd.read_csv(TRAIN_CSV, header=None)

In [None]:
# Show every column when a DataFrame is displayed (None removes the limit).
pd.options.display.max_columns = None

In [None]:
# Fraction of missing values per column (0.0 = complete, 1.0 = entirely missing).
null_percentage = df.isna().mean()

# Labels of the columns in which more than 49% of the values are missing.
null_cols = null_percentage.index[null_percentage > 0.49].tolist()

# Remove those sparse columns from the frame.
df = df.drop(columns=null_cols)

In [None]:
# Shape of the frame as (number of rows, number of columns)
df.shape

In [None]:
# Fraction of missing values per row (0.0 = complete, 1.0 = entirely missing).
null_percentage = df.isna().mean(axis=1)

# Index labels of the rows in which more than 49% of the values are missing.
null_rows = null_percentage.index[null_percentage > 0.49].tolist()

# Remove those sparse rows from the frame.
df = df.drop(index=null_rows)

In [None]:
# Shape of the frame as (number of rows, number of columns)
df.shape

In [None]:
# Box plot of column 70 to inspect its spread and potential outliers.
plt.figure(figsize=(8, 8))
# boxplot does not skip NaN: a single missing value turns the quartiles into
# NaN and the box is not drawn, so missing values are removed first.
plt.boxplot(df[70].dropna())

# set axis labels and title
plt.xlabel('')
plt.ylabel('')
plt.title('Boxplot of Column 70')

# show plot
plt.show()

In [None]:
# Replace outliers in every numeric column with NaN. A value counts as an
# outlier when it lies more than `z_threshold` standard deviations away from
# the column mean.
z_threshold = 3
for column in df.columns:
    # z-scores are only computed for numeric columns
    if not pd.api.types.is_numeric_dtype(df[column]):
        continue
    # nan_policy="omit": with the default ("propagate") one missing value makes
    # every z-score of the column NaN, so no outlier would ever be detected.
    z_scores = np.abs(
        stats.zscore(df[column].to_numpy(dtype=float), nan_policy="omit")
    )
    # Comparisons with NaN are False, so already-missing entries stay untouched.
    outliers = z_scores > z_threshold
    if outliers.any():
        # Assign the whole column back instead of the chained
        # df[column][mask] = ..., which may write to a temporary copy.
        df[column] = df[column].mask(outliers)

In [None]:
# Absolute frequency of each distinct value in column 45

df[45].value_counts()

In [None]:
# Relative frequency (share of the non-null rows) of each distinct value in column 45

df[45].value_counts(normalize=True)

In [None]:
# Impute missing values column by column: every null is replaced by a value
# drawn at random from the column's observed values, with probabilities equal
# to their relative frequencies, so the existing distribution is preserved.
rng = np.random.default_rng(42)  # fixed seed: a re-run gives the same imputation
for col_dist in df.columns:
    null_indices = df[col_dist].isnull()
    n_missing = int(null_indices.sum())
    if n_missing == 0:
        # nothing to fill in this column
        continue
    fill_values = df[col_dist].value_counts(normalize=True)
    if fill_values.empty:
        # Column is entirely null: there is no observed value to sample from.
        continue
    df.loc[null_indices, col_dist] = rng.choice(
        fill_values.index.to_numpy(),
        size=n_missing,
        p=fill_values.to_numpy(),
    )

In [None]:
# Drop quasi-constant columns: a column is removed when its most frequent
# value accounts for more than 95% of all rows.
threshold = 0.95
quasi_constant_cols = []
for col in df.columns:
    frequencies = df[col].value_counts()
    if frequencies.empty:
        # No non-null value at all (or an empty frame): there is no "top" value.
        continue
    # value_counts() is sorted descending, so the first entry is the top count.
    # (The row count is not stored under the name `sum`, which would shadow the
    # builtin for every later cell.)
    similarity = frequencies.iloc[0] / len(df)
    if similarity > threshold:
        quasi_constant_cols.append(col)
df = df.drop(columns=quasi_constant_cols)

In [None]:
# Shape of the frame as (number of rows, number of columns)
df.shape

In [None]:
# Two-sample t-test comparing the means of columns 9 and 53.
# The difference is significant when the p-value falls BELOW the significance
# level; a p-value always lies in [0, 1], so it can never be negative and a
# large p-value means there is no evidence of a difference.
alpha = 0.05
t_stat, p_value = ttest_ind(df[9], df[53])

if p_value < alpha:
    print("There is a significant difference in columns.")
else:
    print("There is no significant difference in columns.")

In [None]:
# Correlation heat map of the numeric columns.
# Only numeric/boolean columns are passed to corr(): string columns cannot be
# correlated and make DataFrame.corr() raise in recent pandas versions.
# The matrix is computed once and reused for the plot.
corr_matrix = df.select_dtypes(include=["number", "bool"]).corr()
sn.set(rc={'figure.figsize': (30, 30)})
sn.heatmap(corr_matrix, annot=True)
plt.show()

In [None]:
# Scatter plot to inspect the relationship between columns 80 and 109

fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(df[80], df[109])
ax.set_xlabel('80')
ax.set_ylabel('109')
ax.set_title('Scatter Plot for 80 and 109 Columns')
plt.show()

In [None]:
# Drop the columns whose correlation with another column is higher than 0.78.
# One drop call removes them all at once: either every listed column is
# removed or (if a label is missing) none is, so the frame is never left
# half-processed.
df = df.drop(columns=[35, 37, 66, 71, 72, 99, 104, 118])

In [None]:
# Shape of the frame as (number of rows, number of columns)
df.shape

In [None]:
# Number of missing values remaining in each column
df.isna().sum()

In [None]:
# Preview the first 20 rows
df.head(20)

In [None]:
# Encode the string categories as integer codes.
# Outer key: column label; inner dict: category string -> integer code.
# Values that are not listed in a mapping are left unchanged by replace().
category_codes = {
    5: {'Y': 1, 'N': 0},
    16: {'Unad': 0, 'Famy': 1, 'Spor': 2, 'Chin': 3, 'OthB': 4, 'OthA': 5, 'Groe': 6},
    30: {'Worg': 0, 'Come': 1, 'Penr': 2, 'Stat': 3, 'Uned': 4, 'Stut': 5, 'Busn': 6, 'Mate': 7},
    31: {'Hout': 0, 'Wits': 1, 'Munt': 2, 'Rent': 3, 'Offt': 4, 'Co-t': 5},
    47: {'Y': 1, 'N': 0},
    67: {'Secl': 0, 'Hign': 1, 'Incr': 2, 'Lowy': 3, 'Acae': 4},
    # 'XNA' shares code 0 with 'F'
    69: {'M': 1, 'F': 0, 'XNA': 0},
    93: {'MONY': 0, 'TUEY': 1, 'WEDY': 2, 'THUY': 3, 'FRIY': 4, 'SATY': 5, 'SUNY': 6},
    103: {'Cass': 1, 'Revs': 0},
    117: {'Mard': 0, 'Sind': 1, 'Cive': 2, 'Sepd': 3, 'Widw': 4, 'Unkn': 5},
}
for col_label, codes in category_codes.items():
    df[col_label] = df[col_label].replace(codes)

In [None]:
# Normalise the dtype of every int64/float64 column: when more than half of
# its values lie strictly between 0 and 1 the column is stored as float,
# otherwise as int. Non-numeric columns are left untouched.
for column in df.columns:
    if df[column].dtype == 'int64' or df[column].dtype == 'float64':
        # count the number of values in the column that are between 0 and 1
        count = ((df[column] > 0) & (df[column] < 1)).sum()

        # if the majority of values are between 0 and 1, convert the column to float
        if count > len(df[column]) / 2:
            df[column] = df[column].astype(float)
        else:
            # NOTE: astype(int) truncates any fractional part and raises if the
            # column still contains NaN.
            df[column] = df[column].astype(int)

In [None]:
# Data type of each column
df.dtypes

In [None]:
# Round the listed columns to two decimal places
two_decimal_cols = [25, 53, 58, 89, 109]
df[two_decimal_cols] = df[two_decimal_cols].round(2)

In [None]:
# Preview the first 20 rows
df.head(20)

In [None]:
# Number of missing values remaining in each column
df.isna().sum()

In [None]:
# Correlation heat map of the numeric columns.
# Only numeric/boolean columns are passed to corr(), so a column that is still
# stored as strings cannot make the call fail. The matrix is computed once and
# reused for the plot.
corr_matrix = df.select_dtypes(include=["number", "bool"]).corr()
sn.set(rc={'figure.figsize': (30, 30)})
sn.heatmap(corr_matrix, annot=True)
plt.show()

In [None]:
# Write the current version of the frame to df.csv (relative path, i.e. the
# current working directory); index=False leaves the row index out of the file.
df.to_csv('df.csv', index=False)

In [None]:
# Shape of the frame as (number of rows, number of columns)
df.shape

In [None]:
# Number of missing values remaining in each column
df.isna().sum()

In [None]:
# Histogram of column 93, which was identified as the day-of-week column;
# seven bins, one per encoded day, to see how the rows spread over the week.
fig, ax = plt.subplots(figsize=(5, 4))
ax.hist(df[93], bins=7)
ax.set_xlabel('Days')
ax.set_ylabel('Count')
ax.set_title('Histogram of Days')
plt.show()

In [None]:
# Bar chart with the number of rows per category of column 69 (the 'Sex' column)
plt.figure(figsize=(5, 4))
ax = sn.countplot(x=69, data=df)
ax.set_xlabel('Sex')
ax.set_ylabel('Count')
ax.set_title('Distribution of Sex')
plt.show()