In [25]:
import pandas as pd
import os
from typing import Optional

In [31]:
def combine_csv_files(input_directory, output_file, id_column: Optional):
    """
    Reads all CSV files in a directory, combines them into one DataFrame,
    removes duplicate rows based on a specific ID column, and saves the result to a single CSV.

    Parameters:
        input_directory (str): Path to the directory containing the CSV files.
        output_file (str): Path to save the final combined CSV.
        id_column (str): The column name to identify duplicates.

    Returns:
        None
    """
    # List all CSV files in the directory
    csv_files = [file for file in os.listdir(input_directory) if file.endswith('.csv')]
    print(f"Found {len(csv_files)} CSV files in '{input_directory}'.")

    combined_df = pd.DataFrame()  # Initialize an empty DataFrame

    # Read and append each CSV file
    for csv_file in csv_files:
        file_path = os.path.join(input_directory, csv_file)
        print(f"Reading {file_path}...")
        try:
            df = pd.read_csv(file_path)
            combined_df = pd.concat([combined_df, df], ignore_index=True)
        except Exception as e:
            print(f"Error reading {csv_file}: {e}")

    # Remove duplicate rows based on the specified ID column
    before_dedup = combined_df.shape[0]
    # combined_df = combined_df.drop_duplicates(subset=[id_column])
    combined_df = combined_df.drop_duplicates()
    after_dedup = combined_df.shape[0]
    #print(f"Removed {before_dedup - after_dedup} duplicate rows based on '{id_column}'.")
    print(f"Removed {before_dedup - after_dedup} duplicate rows.")

    # Save the combined DataFrame to a new CSV
    combined_df.to_csv(output_file, index=False)
    print(f"Combined data saved to '{output_file}'.")

In [32]:
input_directory = "data/location_data"
output_file = "data/combined_locations.csv"
#id_column = "Place ID"

combine_csv_files(input_directory, output_file, id_column)

Found 4 CSV files in 'data/location_data'.
Reading data/location_data/naturals_Chennai East_data.csv...
Reading data/location_data/naturals_Chennai West_data.csv...
Reading data/location_data/naturals_Chennai South_data.csv...
Reading data/location_data/naturals_Chennai North_data.csv...
Removed 91 duplicate rows.
Combined data saved to 'data/combined_locations.csv'.


In [33]:
# Example usage
input_directory = "data/review_data"  # Directory containing the 4 CSV files
output_file = "data/combined_reviews.csv"

combine_csv_files(input_directory, output_file, id_column)

Found 4 CSV files in 'data/review_data'.
Reading data/review_data/naturals_Chennai North_reviews.csv...
Reading data/review_data/naturals_Chennai South_reviews.csv...
Reading data/review_data/naturals_Chennai East_reviews.csv...
Reading data/review_data/naturals_Chennai West_reviews.csv...
Removed 455 duplicate rows.
Combined data saved to 'data/combined_reviews.csv'.


In [48]:
combined_locations = pd.read_csv("data/combined_locations.csv")

In [49]:
combined_locations

Unnamed: 0,Name,Address,Rating,Total Reviews,Place ID
0,Naturals Signature Salon,"No: G-38,No: 142 ,Lower Ground Floor, Velacher...",4.8,2309,ChIJ6aCWRmJnUjoRRmaT-wCYdLE
1,Naturals Signature Salon,"No.1, 2 223, above UNION BANK OF INDIA, Karpag...",4.8,3759,ChIJkZqNtEZdUjoRNidYNEyXDWw
2,Naturals Signature perungudi,"74, Church Rd, above Nilgiris Super Market, Ph...",4.7,362,ChIJGU34ybRdUjoRsa1guJtq2Sw
3,Naturals Signature Salon Ashok Nagar,"3/47, 11th Ave, opposite to KFC, Ashok Nagar, ...",4.8,858,ChIJ6cr6nWRnUjoRd8fxeM4MHQ0
4,Naturals Signature Salon,"New No 96, Old 139, Arcot Rd, above Federal Ba...",4.8,690,ChIJVTkIp_BhUjoRk6rtnXy3AWU
...,...,...,...,...,...
57,Naturals Salon,"2 & 3, 100 Feet Rd, Ambal Nagar, Ekkatuthangal...",4.7,1907,ChIJaVvU9jxnUjoRkh7iNmHaS9E
58,Naturals salon,"32, Arcot Rd, above Mcrennett, United India Co...",4.8,1545,ChIJv44gOPJmUjoRV6Gi1GZQOhs
59,Naturals Salon,"1st Floor, No 245, Avvai Shanmugam Salai, Gana...",4.6,790,ChIJpTzsnDdmUjoRZznlMkDIjkg
60,Naturals Salon,"38, Velachery Bypass Rd, above Dindukkal, Vija...",4.8,2015,ChIJq7BiIZBdUjoRkjOu4SwVNa0


In [45]:
combined_locations['Total Reviews'].sum()

np.int64(76740)

In [36]:
combined_locations[combined_locations['Address'] == "Ground Floor, Ceebros Enclave, No. 31, 1st Main Rd, Gandhi Nagar, Adyar, Chennai, Tamil Nadu 600020, India"]

Unnamed: 0,Name,Address,Rating,Total Reviews,Place ID
26,Naturals Salon,"Ground Floor, Ceebros Enclave, No. 31, 1st Mai...",4.2,802,ChIJT28vgJNnUjoRiPj-mGCrlOA


In [37]:
len(combined_locations)

62

In [46]:
combined_reviews = pd.read_csv("data/combined_reviews.csv")

In [47]:
combined_reviews

Unnamed: 0,Business Name,Address,Reviewer Name,Review,Rating,Review Date,Place ID
0,Naturals Signature Salon,"No: G-38,No: 142 ,Lower Ground Floor, Velacher...",Aida Henry,Thank you Sakunthala for your great work and l...,5,2024-09-03 10:38:57,ChIJ6aCWRmJnUjoRRmaT-wCYdLE
1,Naturals Signature Salon,"No: G-38,No: 142 ,Lower Ground Floor, Velacher...",Sonia Kurup,Recently visited the branch for a cut and glob...,5,2024-11-29 17:41:35,ChIJ6aCWRmJnUjoRRmaT-wCYdLE
2,Naturals Signature Salon,"No: G-38,No: 142 ,Lower Ground Floor, Velacher...",bhavika Singh,excellent beautician kabita di. superb eyebrow...,5,2024-09-26 15:10:54,ChIJ6aCWRmJnUjoRRmaT-wCYdLE
3,Naturals Signature Salon,"No: G-38,No: 142 ,Lower Ground Floor, Velacher...",Mairiwiliu Newmai,Had a great experience at Naturals . The staff...,5,2024-09-11 13:38:54,ChIJ6aCWRmJnUjoRRmaT-wCYdLE
4,Naturals Signature Salon,"No: G-38,No: 142 ,Lower Ground Floor, Velacher...",G V,Received Manicure and Pedicure service from s...,5,2024-09-10 11:35:27,ChIJ6aCWRmJnUjoRRmaT-wCYdLE
...,...,...,...,...,...,...,...
305,Naturals Salon,"New Door No.88, Old Door No.45, 46 & 46/1 Navi...",Ray Stargazing,Mrs.Sheeba was lovely! She did the haircut wel...,5,2024-10-16 14:21:44,ChIJUUOJu_tmUjoRL7BsVmdNZnw
306,Naturals Salon,"New Door No.88, Old Door No.45, 46 & 46/1 Navi...",malathi prasath,Had a relaxed experience on pedicure....servic...,5,2024-10-24 05:03:30,ChIJUUOJu_tmUjoRL7BsVmdNZnw
307,Naturals Salon,"New Door No.88, Old Door No.45, 46 & 46/1 Navi...",jayashree ganesan,"Did dandruff treatment, spa and hair cut at na...",5,2024-10-19 06:41:46,ChIJUUOJu_tmUjoRL7BsVmdNZnw
308,Naturals Salon,"New Door No.88, Old Door No.45, 46 & 46/1 Navi...",SriVaishnavi Balaji,Experiencing a very good service every time I ...,5,2024-12-03 10:44:57,ChIJUUOJu_tmUjoRL7BsVmdNZnw


In [42]:
import os

def consolidate_text_files(input_directory, output_file):
    """
    Consolidates all text files in a directory into a single text file, removing duplicate lines.

    Parameters:
        input_directory (str): Path to the directory containing text files.
        output_file (str): Path to save the consolidated text file.

    Returns:
        None
    """
    try:
        unique_lines = set()  # Set to store unique lines

        # Iterate through all files in the directory
        for filename in os.listdir(input_directory):
            if filename.endswith(".txt"):  # Only process .txt files
                file_path = os.path.join(input_directory, filename)
                print(f"Processing: {file_path}")

                # Read the contents of the text file
                with open(file_path, "r") as infile:
                    for line in infile:
                        unique_lines.add(line.strip())  # Add lines to the set (removes duplicates)

        # Write unique lines to the output file
        with open(output_file, "w") as outfile:
            for line in sorted(unique_lines):  # Sort for consistent order (optional)
                outfile.write(line + "\n")

        print(f"All text files have been consolidated into '{output_file}' with duplicates removed.")
    except Exception as e:
        print(f"An error occurred: {e}")

In [43]:
# Example usage
input_directory = "data/location_data/maps_urls"  # Directory containing text files
output_file = "data/consolidated_urls.txt"  # Output consolidated text file
consolidate_text_files(input_directory, output_file)

Processing: data/location_data/maps_urls/Chennai North.txt
Processing: data/location_data/maps_urls/Chennai South.txt
Processing: data/location_data/maps_urls/Chennai West.txt
Processing: data/location_data/maps_urls/Chennai East.txt
All text files have been consolidated into 'data/consolidated_urls.txt' with duplicates removed.
