# Read CSV data

In [23]:
import pandas as pd 
file_path = 'fortune500.csv'
dataframe_csv =pd.read_csv(file_path)

# Display the rows of the DataFrame. But it show an error because the file is damaged. Damaged data is the data that has less or more than 5 elements (columns) and some data is missing.
dataframe_csv 

ParserError: Error tokenizing data. C error: Expected 5 fields in line 34, saw 6


# Handle damaged CSV data

In [24]:
# Now we need to handle the damaged CSV data.
import csv

file_path = 'fortune500.csv'


index = 0
title = []
good_data = []
bad_data = []

with open(file_path, mode='r', encoding='utf-8') as file:
    csv_reader = csv.reader(file)
    for row in csv_reader:
        if index == 0:
            print(row)
            # First index (row) is the title in our csv file
            title = row
        index += 1
        # The row has 5 elements and not the first index (row), we can consider it as good data and add it to the good_data list
        if len(row) == 5 and index != 1:
            good_data.append(row)
        else:
            bad_data.append(row)

good_data

['#Year', '(1)Rank', '!Company', '(3)Revenue (in millions)', 'okjb)Profit (in millions)']


[['1955', '1', 'General Motors', '9823.5', '806'],
 ['1955', '2', 'Exxon Mobil', '5661.4', '584.8'],
 ['1955', '3', 'U.S. Steel', '3250.4', '195.4'],
 ['1955', '4', 'General Electric', '', '212.6'],
 ['1955', '5', 'Esmark', '2510.8', '19.1'],
 ['1955', '6', 'Chrysler', '2071.6', '18.5'],
 ['1955', '7', 'Armour', '2056.1', '1.6'],
 ['1955', '8', 'Gulf Oil', '1705.3', '182.8'],
 ['1955', '9', 'Mobil', '1703.6', '183.8'],
 ['1955', '10', 'DuPont', '1687.7', '344.4'],
 ['1955', '11', 'Amoco', '1667.4', '132.8'],
 ['1955', '12', 'Bethlehem Steel', '1660.3', '117.2'],
 ['1955', '13', 'CBS', '1631', '84.6'],
 ['1955', '14', 'Texaco', '1574.4', '226.1'],
 ['1955', '15', 'AT&T Technologies', '1526.2', '55.8'],
 ['1955', '16', 'Shell Oil', '1312.1', '121.1'],
 ['1955', '17', 'Kraft', '1210.3', '37.4'],
 ['1955', '18', 'ChevronTexaco', '1113.3', '211.9'],
 ['1955', '19', 'Goodyear Tire & Rubber', '1090.1', '48.1'],
 ['1955', '20', 'Boeing', '1033.2', '37'],
 ['1955', '21', 'Sinclair Oil', '1021.5

In [25]:
# Now the good data is in the good_data list, we can put it into a dataframe
csv_df = pd.DataFrame(good_data,columns = title)

# Display the DataFrame
csv_df

Unnamed: 0,#Year,(1)Rank,!Company,(3)Revenue (in millions),okjb)Profit (in millions)
0,1955,1,General Motors,9823.5,806
1,1955,2,Exxon Mobil,5661.4,584.8
2,1955,3,U.S. Steel,3250.4,195.4
3,1955,4,General Electric,,212.6
4,1955,5,Esmark,2510.8,19.1
...,...,...,...,...,...
28838,2009,996,Tellabs,1729.00,-930.1
28839,2009,997,Administaff,1724.40,45.8
28840,2009,998,Sanderson Farms,1723.60,-43.1
28841,2009,999,MGIC Investment,1721.50,-518.9


# Read JSON data

In [26]:
file_path = 'lines.json'
dataframe_json =pd.read_json(file_path)

# Display the rows of the DataFrame. But it show an error because the file is damaged.
dataframe_json

ValueError: Trailing data

# Handle damaged JSON data

In [27]:
import json

file_path = 'lines.json'
data = []
titles = ['Year', 'Rank', 'Company', 'Revenue (in millions)', 'Profit (in millions)'] 

with open(file_path, 'r') as file:
    for line in file:
        try:
            json_data = json.loads(line)
            # Check if the json_data is a dictionary
            for key in titles:
                # If the key is not in the json_data, add it with None value
                if key not in json_data:
                    json_data[key] = None
                    
            # Add the json_data to the data list
            data.append(json_data)

        except json.JSONDecodeError:
            print(f"Invalid JSON data: {line}")

Invalid JSON data: {"Year":1977,"Rank":,"Company":"Shell Oil","Revenue (in millions)":"9230","Profit (in millions)":"705.8"}

Invalid JSON data: {"Year":1977,"Rank":17,"Company":"Conoco","Revenue (in millions)":,"Profit (in millions)":"460"}

Invalid JSON data: {"Year":1977?,"Rank":20,"Company":"Tenneco Automotive","Revenue (in millions)":"6389.2","Profit (in millions)":"383.5"}

Invalid JSON data: {"Year":1977,"Rank":21,"Company":,"Revenue (in millions)":"6345.7","Profit (in millions)":"441.2"}

Invalid JSON data: {"Year":1977,"Rank":26,"Company":"Occidental Petroleum","Revenue (in millions)":,"Profit (in millions)":"183.7"}

Invalid JSON data: {"Year":1977,"Rank":43,"Company":,"Revenue (in millions)":"4086.8","Profit (in millions)":"136"}

Invalid JSON data: {"Year":1977aka,"Rank":381,"Company":"National Service Industries","Revenue (in millions)":"482.7","Profit (in millions)":"24.9"}

Invalid JSON data: {"Year":1977,"Rank":418,"Company":"Kellwood","Revenue (in millions)":,"Profit (

In [28]:
# Now the good data is in the good_data list, we can put it into a dataframe
json_df = pd.DataFrame(good_data,columns = title)

# Display the DataFrame
json_df

Unnamed: 0,#Year,(1)Rank,!Company,(3)Revenue (in millions),okjb)Profit (in millions)
0,1955,1,General Motors,9823.5,806
1,1955,2,Exxon Mobil,5661.4,584.8
2,1955,3,U.S. Steel,3250.4,195.4
3,1955,4,General Electric,,212.6
4,1955,5,Esmark,2510.8,19.1
...,...,...,...,...,...
28838,2009,996,Tellabs,1729.00,-930.1
28839,2009,997,Administaff,1724.40,45.8
28840,2009,998,Sanderson Farms,1723.60,-43.1
28841,2009,999,MGIC Investment,1721.50,-518.9


# Read txt data

In [29]:
file_path = 'unstructureddata.txt'  

# Function to process a block of text and return a dictionary
def process_block(block):
    entity = {}
    for item in block: 
        key, value = item.split(":")
        entity[key] = value
    return entity

# Read and process the file
current_data = []
result = []
with open(file_path,'r') as file:
    for line in file:
        # Your code
        if line.strip() == "":
            dfcontent = process_block(current_data)
            result.append(dfcontent)
            current_data = []
        else:
            current_data.append(line.strip())

# Print the first 3 rows of the result
print(result[:3])

[{'Year': ' 1998', 'Rank': ' 401', 'Company': ' Turner Corp.', 'Revenue (in millions)': ' 3639.8', 'Profit (in millions)': ' 5.9'}, {'Year': ' 1998', 'Rank': ' 402', 'Company': ' Reebok International', 'Revenue (in millions)': ' 3637.4', 'Profit (in millions)': ' 135.1'}, {'Year': ' 1998', 'Rank': ' 403', 'Company': ' Morton International', 'Revenue (in millions)': ' 3636.5', 'Profit (in millions)': ' 343'}]


# Handle damaged txt data

In [30]:
file_path = 'unstructureddata.txt'

expected_columns = ['Year', 'Rank', 'Company', 'Revenue (in millions)', 'Profit (in millions)']

current_data = []
result = []

# Function to process a block of text and return a dictionary
def process_block_with_damaged(block):
    entity = {key: None for key in expected_columns}
    #Your code:
    for item in block:
        if ": " in item:
            key, value = item.split(": ")
            if key in entity:
                entity[key] = value
    
    return entity

with open(file_path, 'r')as file:
    for line in file:
        if line.strip() == "":
            processed_data = process_block_with_damaged(current_data)
            result.append(processed_data)
            current_data = []
        else:
            current_data.append(line.strip())

In [31]:
# Now the good data is in the result list, we can put it into a dataframe
txt_df = pd.DataFrame(result)

# Display the DataFrame
txt_df

Unnamed: 0,Year,Rank,Company,Revenue (in millions),Profit (in millions)
0,1998,401,Turner Corp.,3639.8,5.9
1,1998,402,Reebok International,3637.4,135.1
2,1998,403,Morton International,3636.5,343
3,1998,404,Engelhard,3630.7,47.8
4,1998,405,Temple-Inland,3625.4,50.8
...,...,...,...,...,...
7595,2009,996,Tellabs,1729.00,-930.1
7596,2009,997,Administaff,1724.40,45.8
7597,2009,998,Sanderson Farms,1723.60,-43.1
7598,2009,999,MGIC Investment,1721.50,-518.9


# Data integration, integrating all three types of data to Results_Combine

In [38]:
# Write the combined data to a new CSV file
output_path = 'Results_Combine.csv'

# Write the combined data to a new CSV file
with open(output_path, 'w', encoding='utf-8') as f:
    # Write the JSON data to the file
    f.write(json_df.to_csv(index=False, header=False))
    # Write the TXT data to the file
    f.write(txt_df.to_csv(index=False))
    # Write the CSV data to the file
    f.write(csv_df.to_csv(index=False))
