In [14]:
import os
import pandas as pd

def remove_null_rows(input_folder, output_file, columns_to_drop):
    # Initialize an empty list to store DataFrames
    dfs = []
    
    # Iterate over files in the input folder
    for file in os.listdir(input_folder):
        if file.endswith(".xlsx") or file.endswith(".xls"):
            # Print the name of the current file
            print("Processing file:", file)
            
            # Read each Excel file into a DataFrame and append to the list
            df = pd.read_excel(os.path.join(input_folder, file))
            df.drop(columns=columns_to_drop, inplace=True)
            dfs.append(df)
    
    # Print the header of the first file
    if dfs:
        print("\nHeader of the first file:")
        print(dfs[0].head())
    
    # Concatenate all DataFrames in the list into a single DataFrame
    combined_df = pd.concat(dfs, ignore_index=True)
    
    # Drop rows with any null or blank values
    combined_df.dropna(axis=0, how='any',inplace=True)
    
    # Remove rows with non-character values at the beginning of "Petitior City" column
    combined_df = combined_df[combined_df['Petitioner City'].str.match(r'^[^0-9-]')]
    
    # Save the cleaned DataFrame to a new Excel file
    combined_df.to_excel(output_file, index=False)
    
    print("\nNull rows removed. Cleaned data saved to", output_file)

In [15]:
input_folder = "/kaggle/input/uscis-h1b-data-2009-2023/USCIS H1B Data"
output_file = "cleaned_data.xlsx"
columns_to_drop = ["Line by line"]
remove_null_rows(input_folder, output_file, columns_to_drop)

Processing file: 2013-2014.xlsx
Processing file: 2009-2010.xlsx
Processing file: 2022.xlsx
Processing file: 2023.xlsx
Processing file: 2015-2016.xlsx
Processing file: 2019 - 2020.xlsx
Processing file: 2011-2012.xlsx
Processing file: 2021.xlsx
Processing file: 2017-2018.xlsx

Header of the first file:
   Fiscal Year    Employer (Petitioner) Name  Tax ID  \
0            2014                        NaN   530.0   
1            2014                        NaN  1886.0   
2            2014                        NaN  2683.0   
3            2014                        NaN  3096.0   
4            2014                        NaN  3720.0   

                               Industry (NAICS) Code Petitioner City  \
0    11 - Agriculture, Forestry, Fishing and Hunting          TOWNER   
1  54 - Professional, Scientific, and Technical S...   BUFFALO GROVE   
2                                                NaN       ARLINGTON   
3                         52 - Finance and Insurance         FAIRFAX   
4

True
