In [4]:
import mysql.connector
import os
import time
import chardet

In [5]:

# connect to the database
# NOTE(review): credentials are hard-coded; consider reading them from the
# environment (e.g. os.environ) before sharing this notebook.
cnx = mysql.connector.connect(user='weather-data', password='weather-data',
                              host='localhost',
                              database='weather-data')
try:
    # create a cursor object
    cursor = cnx.cursor()
    try:
        # The UNIQUE KEY is what gives the INSERT IGNORE issued by insert_data()
        # something to ignore: IGNORE only suppresses duplicate-key errors, and
        # the AUTO_INCREMENT id never collides, so without this key every re-run
        # of the ingestion appends a full duplicate copy of the data.
        # Assumes at most one observation per station per date -- TODO confirm.
        # Because of IF NOT EXISTS, a table created by an earlier run keeps its
        # old definition; add the key with ALTER TABLE (or recreate the table)
        # to get de-duplication there.
        create_table_query = '''CREATE TABLE IF NOT EXISTS weather_data (
                                    id INT NOT NULL AUTO_INCREMENT,
                                    station_id VARCHAR(255) NOT NULL,
                                    date DATE NOT NULL,
                                    Max_Temp FLOAT,
                                    Min_Temp FLOAT,
                                    Precipitation FLOAT,
                                    PRIMARY KEY (id),
                                    UNIQUE KEY uq_weather_station_date (station_id, date)
                               )'''
        cursor.execute(create_table_query)
    finally:
        # close the cursor even if the DDL statement failed
        cursor.close()
finally:
    # always release the database connection
    cnx.close()

# bulk-insert helper for the weather_data table
def insert_data(cursor, data):
    """Send a batch of weather rows to the database through ``cursor``.

    Args:
        cursor: Object exposing ``executemany(statement, params)``.
        data: Sequence of parameter sets, one per row, in the column order
            (station_id, date, Max_Temp, Min_Temp, Precipitation).

    The statement is handed to ``cursor.executemany`` as-is; this function
    does not commit and does not return anything.
    """
    insert_statement = (
        "INSERT IGNORE INTO weather_data "
        "(station_id, date, Max_Temp, Min_Temp, Precipitation) "
        "VALUES (%s, %s, %s, %s, %s)"
    )
    cursor.executemany(insert_statement, data)

# Module-level run counters. ingest_data() assigns its own local variables
# with these same names, so the function neither reads nor updates these.
num_records_ingested = 0
start_time = time.time()

# helper: parse one local weather file into rows for the weather_data table
def _parse_weather_file(file_path, station_id):
    """Parse a whitespace-delimited weather file into insertable rows.

    Args:
        file_path: Path of the text file to read.
        station_id: Value stored in the first column of every returned row.

    Returns:
        A list of ``(station_id, date, max_temp, min_temp, precipitation)``
        tuples. ``date`` is the first field as text; the next three fields are
        converted with ``float``. Fields beyond the fourth are ignored.
        Blank lines are skipped. A line with fewer than four fields is
        reported and its missing trailing values become ``None``. A line whose
        numeric fields cannot be converted is reported and dropped.
    """
    # detect the encoding from the raw bytes, then re-open the file as text
    with open(file_path, 'rb') as f:
        encoding = chardet.detect(f.read())['encoding']

    rows = []
    with open(file_path, 'r', encoding=encoding) as f:
        for line in f:
            fields = line.split()
            if not fields:
                # A line read from a file still carries its newline, so testing
                # the raw line for emptiness never fires; test the parsed
                # fields instead so blank lines do not become all-NULL rows.
                continue
            if len(fields) < 4:
                # handle case where line does not have expected number of fields
                print(f"Error: line does not have expected number of fields: {line}")
                fields += [None] * (4 - len(fields))
            try:
                date = fields[0]
                max_temp = float(fields[1]) if fields[1] else None
                min_temp = float(fields[2]) if fields[2] else None
                precipitation = float(fields[3]) if fields[3] else None
            except ValueError as e:
                # handle case where a field cannot be converted to a float
                print(f"Error: could not convert field to float: {e}")
                continue
            rows.append((station_id, date, max_temp, min_temp, precipitation))
    return rows


# main function to ingest data from local text files into MySQL database
def ingest_data(local_path, db_host, db_user, db_password, db_name):
    """Load every non-hidden file in ``local_path`` into ``weather_data``.

    Each file is parsed into rows and passed to ``insert_data``; the station
    id of a file is the part of its name before the first ``.``. Files are
    processed in sorted name order so the progress output is reproducible.
    All inserts are committed once, after the last file. If anything fails
    before that, nothing is committed, and the cursor and connection are
    still closed.

    Args:
        local_path: Directory containing the weather text files.
        db_host: MySQL host name.
        db_user: MySQL user name.
        db_password: MySQL password.
        db_name: MySQL database name.

    Prints the running total of rows submitted after each file and a final
    summary line with the total and the elapsed time. The totals count rows
    handed to ``insert_data``, which is not necessarily the number of rows
    the database stored. Returns ``None``.
    """
    # connect to the MySQL database
    cnx = mysql.connector.connect(host=db_host, user=db_user, password=db_password, database=db_name)
    try:
        cursor = cnx.cursor()
        try:
            # retrieve the list of files from the local directory
            file_list = sorted(os.listdir(local_path))

            # for each file, ingest data
            num_records_ingested = 0
            start_time = time.time()
            for file_name in file_list:
                # skip hidden files and directories
                if file_name.startswith('.'):
                    continue
                file_path = os.path.join(local_path, file_name)
                if not os.path.isfile(file_path):
                    # a non-hidden sub-directory would make open() fail
                    continue

                # the station id is constant for the whole file
                station_id = file_name.split('.')[0]
                data = _parse_weather_file(file_path, station_id)

                # insert data into the weather_data table
                insert_data(cursor, data)

                # increment the number of records ingested
                num_records_ingested += len(data)
                print(num_records_ingested)

            # commit changes to the database
            cnx.commit()
        finally:
            cursor.close()
    finally:
        # release the connection on success and on failure alike
        cnx.close()

    # log the number of records ingested and the time taken
    end_time = time.time()
    print(f"Ingested {num_records_ingested} records in {end_time - start_time:.2f} seconds")

# example usage: ingest the files under wx_data into the local weather-data database
ingest_data(
    local_path="wx_data",
    db_host="localhost",
    db_user="weather-data",
    db_password="weather-data",
    db_name="weather-data",
)



10957
20031
30411
41064
52021
62947
73904
84770
95727
106563
116150
126167
136821
147778
158705
169512
175599
186525
195260
203749
213824
224781
234643
245600
256465
267146
277981
288880
299837
310794
316270
327168
338064
349021
359428
370354
381159
392116
403043
413969
424926
431775
442732
453659
464586
475543
486500
497246
508083
519040
528936
539832
550789
561746
572125
583082
589353
600250
611148
622075
632482
643408
654216
663441
674398
683284
692841
703675
714571
725528
736089
744610
755567
766524
777390
788194
799090
810047
820946
829955
840883
851230
862187
873113
884009
894846
905713
916641
927598
938555
949153
960110
971067
981781
987503
998430
1009114
1019191
1030148
1040952
1051453
1062321
1073033
1082801
1093727
1104592
1115518
1126414
1137371
1148206
1159163
1170120
1179674
1190603
1201315
1212057
1222953
1231420
1242164
1253029
1263986
1274819
1285776
1296645
1307572
1318255
1329212
1340139
1350701
1360257
1371060
1382017
1390236
1401193
1410536
1416347
1426815
1435733
1