# Startup Success - Exploratory Data Analysis (EDA)

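This notebook explores the startup dataset — its size, missing values, target distribution, and feature correlations — to understand the data before modeling.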

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
# Read the raw startup CSV (note the space in the file name) and show its
# (rows, columns) dimensions as the cell output.
csv_path = '../data/startup data.csv'
df = pd.read_csv(csv_path)
df.shape

In [None]:
# Preview the first five rows to eyeball column names and value formats.
df.head(n=5)

In [None]:
# Count missing values per column, then list the 20 columns with the most gaps.
missing_per_column = df.isna().sum()
missing_per_column.sort_values(ascending=False).head(20)

In [None]:
# Bar chart of how many rows carry each raw `status` value.
# The axes are labelled so the figure can be read on its own, and the cell ends
# with plt.show() so the Axes repr is not echoed as cell output.
ax = df['status'].value_counts().plot(kind='bar', title='Success vs Failure')
ax.set_xlabel('status')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Encode the target as an integer flag: 'acquired' -> 1, 'closed' -> 0.
status_to_binary = {'acquired': 1, 'closed': 0}
df['status_binary'] = df['status'].map(status_to_binary)

# `.map` turns every status that is not a key of the dict (or is missing) into
# NaN, and the later count plot / value_counts would silently drop those rows.
# Fail loudly here instead of carrying a partially defined target forward.
unmapped_statuses = set(df.loc[df['status_binary'].isna(), 'status'].unique())
assert not unmapped_statuses, f"Unmapped status values: {unmapped_statuses}"

In [None]:
# Pairwise correlations between all numeric columns, drawn as an annotated
# heatmap. Any numeric column present in `df` when this cell runs is included.
numeric_cols = df.select_dtypes(include='number')
corr_matrix = numeric_cols.corr()

plt.figure(figsize=(10, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')

## Class Imbalance Check

After defining the binary target variable `status_binary`, it's important to assess the balance between the classes. A highly imbalanced dataset can lead to biased models, especially when the majority class dominates the training process. We will visualize the distribution and compute the proportion of each class.

In [None]:
# Bar chart of the number of rows in each target class.
ax = sns.countplot(x='status_binary', data=df)
ax.set_title("Class Distribution: status_binary")
# Replace the raw 0/1 tick labels with readable class names.
plt.xticks([0, 1], ['Closed (0)', 'Acquired (1)'])
plt.show()

# Share of each class, expressed as a percentage (value_counts with
# normalize=True yields fractions, hence the factor of 100).
class_distribution = 100 * df['status_binary'].value_counts(normalize=True)
print("Class Distribution (%):")
print(class_distribution.round(2))