In [1]:
import pandas as pd
import numpy as np

In [7]:
# Load the raw layoffs CSV; the path is relative to the current working directory.
df = pd.read_csv(filepath_or_buffer="layoffs_data.csv")



In [8]:
# Report the (rows, columns) tuple of the frame before any cleaning step runs.
initial_shape = df.shape
print("Initial shape:", initial_shape)

Initial shape: (3642, 12)


In [9]:
# Preview the leading rows to eyeball column names and value formats.
sample_rows = df.head()
print("\nSample data:\n", sample_rows)


Sample data:
        Company    Location_HQ   Industry  Laid_Off_Count        Date  \
0          Oda           Oslo       Food           150.0  2024-06-05   
1       Pagaya       Tel Aviv    Finance           100.0  2024-06-05   
2  Aleph Farms       Tel Aviv       Food            30.0  2024-06-05   
3      MoonPay          Dover     Crypto            30.0  2024-06-05   
4         Yext  New York City  Marketing             NaN  2024-06-05   

                                              Source  Funds_Raised     Stage  \
0  https://techcrunch.com/2024/06/05/softbank-bac...         691.0   Unknown   
1  https://www.calcalistech.com/ctechnews/article...        2000.0  Post-IPO   
2  https://www.calcalistech.com/ctechnews/article...         119.0   Unknown   
3  https://www.theblock.co/post/298638/moonpay-la...         651.0   Unknown   
4  https://www.investing.com/news/economy-news/ye...         117.0  Post-IPO   

            Date_Added        Country  Percentage List_of_Employees_Lai

In [10]:
print("\nInfo:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
df.info()


Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3642 entries, 0 to 3641
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Company                     3642 non-null   object 
 1   Location_HQ                 3642 non-null   object 
 2   Industry                    3642 non-null   object 
 3   Laid_Off_Count              2389 non-null   float64
 4   Date                        3642 non-null   object 
 5   Source                      3642 non-null   object 
 6   Funds_Raised                3252 non-null   float64
 7   Stage                       3642 non-null   object 
 8   Date_Added                  3642 non-null   object 
 9   Country                     3642 non-null   object 
 10  Percentage                  2342 non-null   float64
 11  List_of_Employees_Laid_Off  3642 non-null   object 
dtypes: float64(3), object(9)
memory usage: 341.6+ KB
None


In [11]:
# Per-column count of missing entries, shown under a header line.
print("\nMissing values:")
missing_per_column = df.isnull().sum()
print(missing_per_column)


Missing values:
Company                          0
Location_HQ                      0
Industry                         0
Laid_Off_Count                1253
Date                             0
Source                           0
Funds_Raised                   390
Stage                            0
Date_Added                       0
Country                          0
Percentage                    1300
List_of_Employees_Laid_Off       0
dtype: int64


In [12]:
# Normalise column names to snake_case: trim surrounding whitespace,
# lowercase, turn inner spaces into "_", and drop parentheses.
#
# regex=False is passed explicitly on every replace: "(" and ")" are not valid
# regular expressions on their own, and the default value of `regex` differs
# between pandas versions, so literal matching must not depend on whichever
# version happens to be installed.
df.columns = (
    df.columns
    .str.strip()
    .str.lower()
    .str.replace(" ", "_", regex=False)
    .str.replace("(", "", regex=False)
    .str.replace(")", "", regex=False)
)

print("\nRenamed columns:", df.columns.tolist())


Renamed columns: ['company', 'location_hq', 'industry', 'laid_off_count', 'date', 'source', 'funds_raised', 'stage', 'date_added', 'country', 'percentage', 'list_of_employees_laid_off']


In [13]:
# Count fully identical rows, drop them (first occurrence is kept), then
# re-count on the result to confirm nothing is left.
dup_count_before = df.duplicated().sum()
print("Duplicates before:", dup_count_before)

df = df.drop_duplicates()

dup_count_after = df.duplicated().sum()
print("Duplicates after:", dup_count_after)

Duplicates before: 0
Duplicates after: 0


In [14]:
# Impute missing values column by column.
#
# The filled Series is assigned back with `df[c] = ...` rather than calling
# `df[c].fillna(..., inplace=True)`: the latter mutates a temporary column
# selection (chained assignment), which triggers a FutureWarning on recent
# pandas and leaves `df` unchanged once Copy-on-Write is enabled.

# Numeric → fill with the column median
num_cols = df.select_dtypes(include=[np.number]).columns
for c in num_cols:
    if df[c].isnull().any():
        df[c] = df[c].fillna(df[c].median())

# Categorical → fill with "Unknown"
cat_cols = df.select_dtypes(include=['object']).columns
for c in cat_cols:
    if df[c].isnull().any():
        df[c] = df[c].fillna("Unknown")

print("\nMissing values after cleaning:")
print(df.isnull().sum())


Missing values after cleaning:
company                       0
location_hq                   0
industry                      0
laid_off_count                0
date                          0
source                        0
funds_raised                  0
stage                         0
date_added                    0
country                       0
percentage                    0
list_of_employees_laid_off    0
dtype: int64


In [16]:
# NOTE(review): the output recorded for this cell shows `date` as
# datetime64[ns] with only 1560 non-null values, but no cell visible in this
# notebook converts that column (the execution count skips In [15]).
# Presumably a deleted cell parsed the dates and coerced failures to NaT —
# confirm, and restore/fix that step so Restart & Run All reproduces this.
print("\nFinal shape:", df.shape)
# DataFrame.info() prints its own report and returns None; calling it directly
# avoids the extra "None" line that print(df.info()) would emit.
df.info()


Final shape: (3642, 12)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3642 entries, 0 to 3641
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   company                     3642 non-null   object        
 1   location_hq                 3642 non-null   object        
 2   industry                    3642 non-null   object        
 3   laid_off_count              3642 non-null   float64       
 4   date                        1560 non-null   datetime64[ns]
 5   source                      3642 non-null   object        
 6   funds_raised                3642 non-null   float64       
 7   stage                       3642 non-null   object        
 8   date_added                  3642 non-null   object        
 9   country                     3642 non-null   object        
 10  percentage                  3642 non-null   float64       
 11  list_of_employees_laid_off  364

In [17]:
# Persist the cleaned frame without the index column, then confirm to the user.
df.to_csv(path_or_buf="cleaned_dataset.csv", index=False)
print("✅ Cleaned dataset saved as cleaned_dataset.csv")

✅ Cleaned dataset saved as cleaned_dataset.csv
