In [3]:
import boto3
import pyarrow.parquet as pq
import pandas as pd
from io import BytesIO

def read_parquet_from_s3(access_key, secret_key, bucket_name, file_key, region, endpoint_url):
    session = boto3.Session(
        aws_access_key_id=access_key,
        aws_secret_access_key=secret_key,
        region_name=region
    )

    s3 = session.client('s3', endpoint_url=endpoint_url)
    response = s3.get_object(Bucket=bucket_name, Key=file_key)
    file_stream = BytesIO(response['Body'].read())
    table = pq.read_table(file_stream)
    df = table.to_pandas()
    
    return df

# Read the Parquet file from S3
df = read_parquet_from_s3(access_key, secret_key, bucket_name, file_key, region, endpoint_url)
df.rename(columns={'postalCode':'postal_code','currentUse':'sector'},inplace=True)
df.drop(columns=['quantilesBuildingAreaAboveGround','quantilesBuildingAreaBelowGround','quantilesBuildingArea','quantilesDwellingArea','averageDwellingArea','quantilesBuildingUnitArea','quantilesYearConstruction'],inplace=True)
df

In [46]:
import geopandas as gpd
import pandas as pd
from shapely.geometry import Point

postalcodes_path = '/home/eouser/Desktop/SpanishPostalCodes/Catalonia/postal_codes.shp'
gdf = gpd.read_file(postalcodes_path)
gdf = gdf.to_crs(epsg=4326)
print(f"Number of postal codes: {len(gdf['COD_POSTAL'].unique())}")
gdf= gdf.rename(columns={'COD_POSTAL':'postal_code'})
gdf['centroid'] = gdf.geometry.centroid

Number of postal codes: 1137



  gdf['centroid'] = gdf.geometry.centroid


In [48]:
merge = gdf.merge(df,how='right',on='postal_code')
merge['latitude'] = gdf['centroid'].y
merge['longitude'] = gdf['centroid'].x 
merge.drop(columns=['centroid','CODIGO_INE','ALTA_DB','ID_CP','geometry'])

Unnamed: 0,postal_code,sector,conditionOfConstruction,totalBuiltAreaAboveGround,averageBuildingAreaAboveGround,totalBuiltAreaBelowGround,averageBuildingAreaBelowGround,totalBuiltArea,averageBuildingArea,averageBuildingUnitArea,averageYearConstruction,dwellings,buildingUnits,buildings,buildingsShare,latitude,longitude
0,08001,1_residential,functional,2.008946e+06,1061.193127,99963.704346,0.000000,2108910.0,1088.961803,217.641722,1896.0,16899.0,21310.0,1682,87.832898,41.379789,2.169247
1,08001,3_industrial,functional,2.447000e+03,815.666667,0.000000,0.000000,2447.0,815.666667,148.479167,1920.0,12.0,18.0,3,0.156658,41.382444,2.175537
2,08001,4_1_office,functional,1.035661e+05,3884.473586,30388.853271,222.137085,133955.0,4545.952381,1180.464399,1947.0,33.0,542.0,23,1.201044,41.384276,2.186578
3,08001,4_2_retail,functional,2.980151e+05,1918.385967,31657.889893,65.568530,329673.0,2105.085271,1144.906400,1920.0,57.0,376.0,137,7.154047,41.372022,2.157608
4,08001,4_3_publicServices,functional,3.014470e+05,3802.631022,28459.978516,184.042907,329907.0,4191.196970,2864.757974,1951.0,20.0,107.0,70,3.655352,41.397270,2.202858
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6102,43897,2_agriculture,functional,3.931300e+04,246.567308,0.000000,0.000000,39313.0,246.567308,157.062500,1979.0,2.0,113.0,110,14.608234,,
6103,43897,3_industrial,functional,3.116117e+05,1142.384601,829.299561,0.000000,312441.0,1145.157233,754.862709,1985.0,0.0,208.0,169,22.443559,,
6104,43897,4_1_office,functional,1.997000e+03,499.250000,0.000000,0.000000,1997.0,499.250000,499.250000,1952.0,0.0,4.0,4,0.531208,,
6105,43897,4_2_retail,functional,1.550061e+03,387.515198,447.939209,111.984802,1998.0,499.500000,387.515198,1982.0,0.0,4.0,4,0.531208,,


In [77]:
df[(df['postal_code'] == '08104')]['averageYearConstruction'].iloc[2] = 1940

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df[(df['postal_code'] == '08104')]['averageYearConstruction'].iloc[2] = 1940
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-ve

In [75]:
df[(df['postal_code'] == '08587')]['averageYearConstruction'].iloc[3] = 1950

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df[(df['postal_code'] == '08587')]['averageYearConstruction'].iloc[3] = 1950
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-ve

In [79]:
df.loc[(df['postal_code'] == '17123') & (df.index == df[(df['postal_code'] == '17123')].index[3]), 'averageYearConstruction'] = 1980

In [80]:
df.loc[(df['postal_code'] == '17171') & (df.index == df[df['postal_code'] == '17171'].index[3]), 'averageYearConstruction'] = 1950
df.loc[(df['postal_code'] == '08587') & (df.index == df[df['postal_code'] == '08587'].index[3]), 'averageYearConstruction'] = 1950
df.loc[(df['postal_code'] == '08104') & (df.index == df[df['postal_code'] == '08104'].index[2]), 'averageYearConstruction'] = 1940
df.loc[(df['postal_code'] == '17171') & (df.index == df[df['postal_code'] == '17171'].index[3]), 'averageYearConstruction'] = 1950
df.loc[(df['postal_code'] == '17444') & (df.index == df[df['postal_code'] == '17444'].index[3]), 'averageYearConstruction'] = 1970
df.loc[(df['postal_code'] == '17745') & (df.index == df[df['postal_code'] == '17745'].index[3]), 'averageYearConstruction'] = 1959
df.loc[(df['postal_code'] == '43410') & (df.index == df[df['postal_code'] == '43410'].index[4]), 'averageYearConstruction'] = 1945


In [29]:
import numpy as np
import pandas as pd
import gstools as gs
import matplotlib.pyplot as plt


# Separate known and unknown data
known_data = df[df['averageYearConstruction'].notnull()]
unknown_data = df[df['averageYearConstruction'].isnull()]

# Define the Gaussian variogram model
model = gs.Gaussian(dim=2, var=1, len_scale=0.01)

# Create Kriging object
kriging = gs.krige.Krige(known_data[['longitude', 'latitude']].values, known_data['value'].values, model)

# Perform Kriging to estimate values at unknown locations
predicted_values = kriging(unknown_data[['longitude', 'latitude']].values)

# Fill in the missing values in the original DataFrame
df.loc[df['value'].isnull(), 'value'] = predicted_values

# Display the imputed DataFrame
print(df)

# Optional: Visualize the results
plt.scatter(known_data['longitude'], known_data['latitude'], c='blue', label='Known', marker='o')
plt.scatter(unknown_data['longitude'], unknown_data['latitude'], c='red', label='Unknown (Imputed)', marker='x')
plt.colorbar()
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.title('Kriging Interpolation')
plt.legend()
plt.show()


Unnamed: 0,postal_code,sector,conditionOfConstruction,totalBuiltAreaAboveGround,averageBuildingAreaAboveGround,totalBuiltAreaBelowGround,averageBuildingAreaBelowGround,totalBuiltArea,averageBuildingArea,averageBuildingUnitArea,averageYearConstruction,dwellings,buildingUnits,buildings,buildingsShare
0,08001,1_residential,functional,2.008946e+06,1061.193127,99963.704346,0.000000,2108910.0,1088.961803,217.641722,1896.0,16899.0,21310.0,1682,87.832898
1,08001,3_industrial,functional,2.447000e+03,815.666667,0.000000,0.000000,2447.0,815.666667,148.479167,1920.0,12.0,18.0,3,0.156658
2,08001,4_1_office,functional,1.035661e+05,3884.473586,30388.853271,222.137085,133955.0,4545.952381,1180.464399,1947.0,33.0,542.0,23,1.201044
3,08001,4_2_retail,functional,2.980151e+05,1918.385967,31657.889893,65.568530,329673.0,2105.085271,1144.906400,1920.0,57.0,376.0,137,7.154047
4,08001,4_3_publicServices,functional,3.014470e+05,3802.631022,28459.978516,184.042907,329907.0,4191.196970,2864.757974,1951.0,20.0,107.0,70,3.655352
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6102,43897,2_agriculture,functional,3.931300e+04,246.567308,0.000000,0.000000,39313.0,246.567308,157.062500,1979.0,2.0,113.0,110,14.608234
6103,43897,3_industrial,functional,3.116117e+05,1142.384601,829.299561,0.000000,312441.0,1145.157233,754.862709,1985.0,0.0,208.0,169,22.443559
6104,43897,4_1_office,functional,1.997000e+03,499.250000,0.000000,0.000000,1997.0,499.250000,499.250000,1952.0,0.0,4.0,4,0.531208
6105,43897,4_2_retail,functional,1.550061e+03,387.515198,447.939209,111.984802,1998.0,499.500000,387.515198,1982.0,0.0,4.0,4,0.531208


In [None]:
df_null = merge.where(pd.notnull(df), None)
df_null

In [81]:
import psycopg2
import yaml
from tqdm import tqdm
import polars as pl
import pandas as pd
import numpy as np

with open('/home/eouser/Desktop/DEDL/credentials.yaml', 'r') as f:
    c = yaml.safe_load(f)["postgres"]
    

conn = psycopg2.connect(f"dbname={c['db_name']} user={c['db_user']} password={c['db_password']} host={c['db_host']} port={c['db_port']} sslmode=require")
cursor = conn.cursor()


dtype_map = {
    'object': 'VARCHAR',
    'int64': 'INTEGER',
    'float64': 'FLOAT',
    'datetime64[ns]': 'TIMESTAMP',
    'datetime64[us]': 'TIMESTAMP',
}


def check_table_exists(table_name):
    query = f"""
        SELECT COUNT(*)
        FROM information_schema.tables
        WHERE table_schema = 'public' AND table_name = '{table_name}'
    """
    cursor.execute(query)
    return cursor.fetchone()[0] 
    

    
def upload_data(df, table_name):
	if isinstance(df, pl.DataFrame):
		df = df.to_pandas()
	print(len(df))
            
	if not check_table_exists(table_name):
		columns = []
		for col_name, dtype in df.dtypes.items():
			if dtype == 'object':
				if df[col_name].apply(lambda x: isinstance(x,np.ndarray)).any():
					pg_type = 'ARRAY' 
					print(col_name)
					df[col_name] = df[col_name].apply(lambda x: x.tolist())
				else:
					pg_type = 'VARCHAR'
			else:
				pg_type = dtype_map.get(str(dtype), 'VARCHAR')

			columns.append(f"{col_name} {pg_type}")

		create_table_query = f"""
		CREATE TABLE {table_name} (
			{', '.join(columns)},
			PRIMARY KEY (sector, postal_code)
		);
		CREATE INDEX ON {table_name} (sector);
		"""
		print(create_table_query)
		cursor.execute(create_table_query)
		conn.commit()

	insert_query = f"""
	INSERT INTO {table_name} ({','.join(df.columns)})
	VALUES ({','.join(['%s'] * len(df.columns))});
	"""
	print(insert_query)
	for x in tqdm(df.to_numpy(), desc=f"Inserting {table_name} rows", unit="rows"):
		print(x)
		cursor.execute(insert_query,tuple(x))
	conn.commit()


In [82]:
upload_data(df,'cadaster')

6107

		CREATE TABLE cadaster (
			postal_code VARCHAR, sector VARCHAR, conditionOfConstruction VARCHAR, totalBuiltAreaAboveGround FLOAT, averageBuildingAreaAboveGround FLOAT, totalBuiltAreaBelowGround FLOAT, averageBuildingAreaBelowGround FLOAT, totalBuiltArea FLOAT, averageBuildingArea FLOAT, averageBuildingUnitArea FLOAT, averageYearConstruction FLOAT, dwellings FLOAT, buildingUnits FLOAT, buildings INTEGER, buildingsShare FLOAT,
			PRIMARY KEY (sector, postal_code)
		);
		CREATE INDEX ON cadaster (sector);
		

	INSERT INTO cadaster (postal_code,sector,conditionOfConstruction,totalBuiltAreaAboveGround,averageBuildingAreaAboveGround,totalBuiltAreaBelowGround,averageBuildingAreaBelowGround,totalBuiltArea,averageBuildingArea,averageBuildingUnitArea,averageYearConstruction,dwellings,buildingUnits,buildings,buildingsShare)
	VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s);
	


Inserting cadaster rows:   8%|▊         | 481/6107 [00:00<00:01, 4806.18rows/s]

['08001' '1_residential' 'functional' 2008946.2956542969
 1061.1931273114628 99963.70434570312 0.0 2108910.0 1088.96180338134
 217.6417215560584 1896.0 16899.0 21310.0 1682 87.83289817232375]
['08001' '3_industrial' 'functional' 2447.0 815.6666666666666 0.0 0.0
 2447.0 815.6666666666666 148.47916666666666 1920.0 12.0 18.0 3
 0.1566579634464752]
['08001' '4_1_office' 'functional' 103566.14672851564 3884.473586309523
 30388.85327148436 222.1370849609375 133955.0 4545.952380952381
 1180.4643990181896 1947.0 33.0 542.0 23 1.2010443864229765]
['08001' '4_2_retail' 'functional' 298015.1101074219 1918.3859674024952
 31657.889892578132 65.56853007494918 329673.0 2105.0852713178297
 1144.906400106137 1920.0 57.0 376.0 137 7.154046997389034]
['08001' '4_3_publicServices' 'functional' 301447.021484375
 3802.6310221354165 28459.978515625 184.04290674603175 329907.0
 4191.19696969697 2864.7579744599557 1951.0 20.0 107.0 70
 3.6553524804177546]
['08002' '1_residential' 'functional' 1251495.965576171

Inserting cadaster rows:  24%|██▍       | 1456/6107 [00:00<00:00, 4785.41rows/s]

['08397' '3_industrial' 'functional' 201958.44091796875 778.7804701721784
 19057.55908203125 0.0 221016.0 839.4466019417475 587.1176818049516 1982.0
 2.0 371.0 218 4.966962861699704]
['08397' '4_1_office' 'functional' 8101.2099609375 900.1344401041666
 2011.7900390625 223.5322265625 10113.0 1123.6666666666667
 819.0788845486111 1975.0 1.0 11.0 9 0.2050580997949419]
['08397' '4_2_retail' 'functional' 259210.173828125 1802.4008969907406
 9252.826171875 8.110911649816176 268463.0 1886.6851851851852
 680.0395221403404 1965.0 103.0 506.0 114 2.5974025974025974]
['08397' '4_3_publicServices' 'functional' 66055.2861328125
 1135.571437669837 1983.7138671875 0.0 68039.0 1178.695652173913
 929.2392123909884 1975.0 0.0 54.0 50 1.1392116655274551]
['08398' '1_residential' 'functional' 306393.38037109375
 211.00585344664046 45073.61962890625 19.12708003259761 351467.0
 246.4377067254686 165.98110901072505 1988.0 2001.0 2511.0 954
 77.5609756097561]
['08398' '2_agriculture' 'functional' 31228.803222

Inserting cadaster rows:  40%|███▉      | 2419/6107 [00:00<00:00, 4784.94rows/s]

['08840' '1_residential' 'functional' 3176106.2958984375 607.675181665665
 531131.7041015625 20.24031311084738 3707238.0 683.3005380476557
 147.24551869092357 1974.0 24988.0 41078.0 4104 86.36363636363636]
['08840' '2_agriculture' 'functional' 30959.0 235.38181818181818 0.0 0.0
 30959.0 235.38181818181818 158.15346534653466 1975.0 2.0 119.0 114
 2.398989898989899]
['08840' '3_industrial' 'functional' 490515.37158203125 1146.0250070830923
 22352.628417968743 0.0 512868.0 1193.0586419753085 775.1712804329209
 1983.0 7.0 372.0 342 7.196969696969697]
['08840' '4_1_office' 'functional' 130223.61816406249 3026.1508316532254
 23017.38183593751 32.974104256465516 153241.0 3431.2580645161293
 732.3113103850507 1975.0 1.0 199.0 33 0.6944444444444444]
['08840' '4_2_retail' 'functional' 165187.80639648438 1045.6151044291835
 104462.19360351562 12.825340141684322 269650.0 1180.032258064516
 356.6602761917075 1967.0 27.0 266.0 66 1.3888888888888888]
['08840' '4_3_publicServices' 'functional' 300619.

Inserting cadaster rows:  56%|█████▌    | 3405/6107 [00:00<00:00, 4866.58rows/s]

['17491' '4_2_retail' 'functional' 14598.56640625 2085.5094866071427
 2445.43359375 349.34765625 17044.0 2434.8571428571427 1775.1166294642858
 1870.0 3.0 16.0 7 0.9102730819245773]
['17491' '4_3_publicServices' 'functional' 9325.71728515625
 1036.1908094618057 1582.28271484375 175.80919053819446 10908.0 1212.0
 1024.524142795139 1930.0 0.0 10.0 9 1.1703511053315996]
['17492' '1_residential' 'functional' 54667.04248046875 455.1751697132888
 168.95751953125 0.0 54836.0 456.81553398058253 362.0899208470395 1933.0
 108.0 114.0 109 59.89010989010989]
['17492' '2_agriculture' 'functional' 45306.0 762.5714285714286 0.0 0.0
 45306.0 762.5714285714286 589.3888888888889 1983.0 4.0 58.0 53
 29.120879120879124]
['17492' '3_industrial' 'functional' 6564.0 547.0 0.0 0.0 6564.0 547.0
 485.0416666666667 1936.0 0.0 14.0 12 6.593406593406594]
['17492' '4_2_retail' 'functional' 2170.0 723.3333333333334 0.0 0.0 2170.0
 723.3333333333334 370.5 1970.0 0.0 5.0 3 1.6483516483516485]
['17492' '4_3_publicServi

Inserting cadaster rows:  72%|███████▏  | 4397/6107 [00:00<00:00, 4917.90rows/s]

['25218' '4_1_office' 'functional' 15883.0 3970.75 0.0 0.0 15883.0 3970.75
 2028.75 1986.0 0.0 6.0 4 1.1869436201780417]
['25218' '4_2_retail' 'functional' 7281.171142578125 1456.234228515625
 388.828857421875 77.765771484375 7670.0 1534.0 1037.9171142578125 1968.0
 0.0 6.0 5 1.483679525222552]
['25218' '4_3_publicServices' 'functional' 1206.0 150.75 0.0 0.0 1206.0
 150.75 100.375 1951.0 0.0 14.0 8 2.3738872403560833]
['25220' '1_residential' 'functional' 323823.8291015625 374.81792758637096
 10753.1708984375 0.0 334577.0 382.30040595399186 305.2298998301465 1953.0
 983.0 1257.0 779 76.0]
['25220' '2_agriculture' 'functional' 141147.0 844.976 0.0 0.0 141147.0
 844.976 569.895652173913 1976.0 0.0 136.0 133 12.975609756097562]
['25220' '3_industrial' 'functional' 104904.85522460938 720.7592959196671
 264.144775390625 0.0 105169.0 723.6304347826087 520.9518915313521 1970.0
 1.0 107.0 98 9.560975609756097]
['25220' '4_1_office' 'functional' 114.0 114.0 0.0 0.0 114.0 114.0 114.0
 1933.0 0.0

Inserting cadaster rows:  88%|████████▊ | 5379/6107 [00:01<00:00, 4884.99rows/s]

['25750' '4_3_publicServices' 'functional' 286.484130859375
 286.484130859375 193.515869140625 193.515869140625 480.0 480.0
 286.484130859375 1950.0 0.0 1.0 1 0.1984126984126984]
['25751' '1_residential' 'functional' 17643.0 286.96 0.0 0.0 17643.0
 286.96 224.9673913043478 1901.0 47.0 62.0 54 39.705882352941174]
['25751' '2_agriculture' 'functional' 41086.697998046875 606.4705882352941
 65.302001953125 0.0 41152.0 606.4705882352941 465.0425531914894 1971.0
 2.0 60.0 55 40.44117647058824]
['25751' '3_industrial' 'functional' 6151.0 307.55 0.0 0.0 6151.0 307.55
 171.725 1974.0 0.0 38.0 20 14.705882352941178]
['25751' '4_2_retail' 'functional' 673.0 336.5 0.0 0.0 673.0 336.5 227.0
 1900.0 0.0 3.0 2 1.4705882352941175]
['25751' '4_3_publicServices' 'functional' 1119.0 223.8 0.0 0.0 1119.0
 223.8 155.5 1916.0 0.0 7.0 5 3.6764705882352944]
['25752' '1_residential' 'functional' 18995.620361328125
 227.37182304186697 303.379638671875 0.0 19299.0 228.10256410256412
 192.80745729258362 1903.0 67

Inserting cadaster rows: 100%|██████████| 6107/6107 [00:01<00:00, 4837.40rows/s]

['43781' '4_1_office' 'functional' 165.0 165.0 0.0 0.0 165.0 165.0 165.0
 1945.0 0.0 1.0 1 0.12239902080783352]
['43781' '4_2_retail' 'functional' 2026.671630859375 506.66790771484375
 35.328369140625 8.83209228515625 2062.0 515.5 506.66790771484375 1931.0
 0.0 4.0 4 0.4895960832313341]
['43781' '4_3_publicServices' 'functional' 14410.0439453125
 1441.00439453125 987.9560546875 98.79560546875 15398.0 1539.8
 1262.90439453125 1935.0 0.0 11.0 10 1.2239902080783354]
['43782' '1_residential' 'functional' 118095.38330078125
 290.13754001010784 3298.61669921875 0.06398079612038353 121394.0
 298.989218328841 255.06187537120798 1925.0 378.0 409.0 391
 58.27123695976155]
['43782' '2_agriculture' 'functional' 43627.0 155.56626506024097 0.0 0.0
 43627.0 155.56626506024097 100.4640522875817 1976.0 4.0 180.0 175
 26.08047690014903]
['43782' '3_industrial' 'functional' 28626.716064453125 258.3222798192224
 121.283935546875 0.0 28748.0 259.7325581395349 175.18838900151636 1944.0
 4.0 99.0 91 13.56184




In [21]:
df.drop(columns=['quantilesBuildingAreaAboveGround','quantilesBuildingAreaBelowGround','quantilesBuildingArea','quantilesDwellingArea','averageDwellingArea','quantilesBuildingUnitArea','quantilesYearConstruction'],inplace=True)

Unnamed: 0,postal_code,ID_CP,ALTA_DB,CODIGO_INE,geometry,centroid,sector,conditionOfConstruction,totalBuiltAreaAboveGround,averageBuildingAreaAboveGround,...,totalBuiltArea,averageBuildingArea,averageBuildingUnitArea,averageYearConstruction,dwellings,buildingUnits,buildings,buildingsShare,latitude,longitude
0,08001,,NaT,,,,1_residential,functional,2.008946e+06,1061.193127,...,2108910.0,1088.961803,217.641722,1896.0,16899.0,21310.0,1682,87.832898,,
1,08001,,NaT,,,,3_industrial,functional,2.447000e+03,815.666667,...,2447.0,815.666667,148.479167,1920.0,12.0,18.0,3,0.156658,,
2,08001,,NaT,,,,4_1_office,functional,1.035661e+05,3884.473586,...,133955.0,4545.952381,1180.464399,1947.0,33.0,542.0,23,1.201044,,
3,08001,,NaT,,,,4_2_retail,functional,2.980151e+05,1918.385967,...,329673.0,2105.085271,1144.906400,1920.0,57.0,376.0,137,7.154047,,
4,08001,,NaT,,,,4_3_publicServices,functional,3.014470e+05,3802.631022,...,329907.0,4191.196970,2864.757974,1951.0,20.0,107.0,70,3.655352,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6102,43897,,NaT,,,,2_agriculture,functional,3.931300e+04,246.567308,...,39313.0,246.567308,157.062500,1979.0,2.0,113.0,110,14.608234,,
6103,43897,,NaT,,,,3_industrial,functional,3.116117e+05,1142.384601,...,312441.0,1145.157233,754.862709,1985.0,0.0,208.0,169,22.443559,,
6104,43897,,NaT,,,,4_1_office,functional,1.997000e+03,499.250000,...,1997.0,499.250000,499.250000,1952.0,0.0,4.0,4,0.531208,,
6105,43897,,NaT,,,,4_2_retail,functional,1.550061e+03,387.515198,...,1998.0,499.500000,387.515198,1982.0,0.0,4.0,4,0.531208,,


In [18]:
df['quantilesDwellingArea'].iloc[1].tolist()


AttributeError: 'NoneType' object has no attribute 'tolist'

In [15]:
df

Unnamed: 0,postal_code,sector,conditionOfConstruction,totalBuiltAreaAboveGround,averageBuildingAreaAboveGround,quantilesBuildingAreaAboveGround,totalBuiltAreaBelowGround,averageBuildingAreaBelowGround,quantilesBuildingAreaBelowGround,totalBuiltArea,...,averageDwellingArea,quantilesDwellingArea,averageBuildingUnitArea,quantilesBuildingUnitArea,averageYearConstruction,quantilesYearConstruction,dwellings,buildingUnits,buildings,buildingsShare
0,08001,1_residential,functional,2.008946e+06,1061.193127,"[341.05, 440.0, 634.25, 895.5, 1325.5, 2038.05...",99963.704346,0.000000,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 172.74893798827...",2108910.0,...,137.073506,"[33.670184210526315, 38.29048275862068, 49.233...",217.641722,"[43.093030303030304, 49.34, 62.943749999999994...",1896.0,"[1845.0, 1850.0, 1869.0, 1900.0, 1910.0, 1950....",16899.0,21310.0,1682,87.832898
1,08001,3_industrial,functional,2.447000e+03,815.666667,"[126.4, 133.8, 156.0, 193.0, 1164.0, 1746.6000...",0.000000,0.000000,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",2447.0,...,,,148.479167,"[120.44375, 121.8875, 126.21875, 133.4375, 163...",1920.0,"[1891.0, 1892.0, 1895.0, 1900.0, 1935.0, 1956....",12.0,18.0,3,0.156658
2,08001,4_1_office,functional,1.035661e+05,3884.473586,"[1041.503564453125, 1103.09052734375, 1763.5, ...",30388.853271,222.137085,"[0.0, 0.0, 0.0, 111.96435546875, 479.320922851...",133955.0,...,,,1180.464399,"[48.1257783190653, 70.05589225589226, 131.2215...",1947.0,"[1872.0, 1900.0, 1920.0, 1955.0, 1992.0, 2000....",33.0,542.0,23,1.201044
3,08001,4_2_retail,functional,2.980151e+05,1918.385967,"[171.58466796875, 468.8, 782.0, 1518.0, 2334.2...",31657.889893,65.568530,"[0.0, 0.0, 0.0, 0.0, 215.664306640625, 602.730...",329673.0,...,,,1144.906400,"[82.22462431936555, 130.86857142857144, 467.0,...",1920.0,"[1850.0, 1862.0, 1900.0, 1910.0, 1936.0, 1990....",57.0,376.0,137,7.154047
4,08001,4_3_publicServices,functional,3.014470e+05,3802.631022,"[616.2, 763.0, 1481.75, 2748.2432861328125, 51...",28459.978516,184.042907,"[0.0, 0.0, 0.0, 0.0, 395.9691162109375, 1752.7...",329907.0,...,,,2864.757974,"[246.76000000000002, 632.4, 1239.6622314453125...",1951.0,"[1887.0, 1890.0, 1930.0, 1955.0, 1989.0, 1996....",20.0,107.0,70,3.655352
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6102,43897,2_agriculture,functional,3.931300e+04,246.567308,"[35.0, 39.9, 64.5, 106.5, 280.5, 644.400000000...",0.000000,0.000000,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",39313.0,...,,,157.062500,"[35.0, 39.9, 64.5, 106.5, 268.0, 644.400000000...",1979.0,"[1951.0, 1960.0, 1967.0, 1980.0, 1996.0, 2000....",2.0,113.0,110,14.608234
6103,43897,3_industrial,functional,3.116117e+05,1142.384601,"[44.0, 67.0, 185.0, 598.0, 1663.0, 3252.400000...",829.299561,0.000000,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",312441.0,...,,,754.862709,"[44.0, 67.0, 166.0, 591.0, 1456.0, 2717.000000...",1985.0,"[1914.0, 1948.0, 1980.0, 1990.0, 2001.0, 2008....",0.0,208.0,169,22.443559
6104,43897,4_1_office,functional,1.997000e+03,499.250000,"[61.0, 82.0, 145.0, 187.5, 541.75, 1165.900000...",0.000000,0.000000,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",1997.0,...,,,499.250000,"[61.0, 82.0, 145.0, 187.5, 541.75, 1165.900000...",1952.0,"[1904.0, 1909.0, 1922.0, 1952.0, 1982.0, 1994....",0.0,4.0,4,0.531208
6105,43897,4_2_retail,functional,1.550061e+03,387.515198,"[73.0, 91.0, 145.0, 255.0, 497.51519775390625,...",447.939209,111.984802,"[0.0, 0.0, 0.0, 0.0, 111.98480224609375, 313.5...",1998.0,...,,,387.515198,"[73.0, 91.0, 145.0, 255.0, 497.51519775390625,...",1982.0,"[1972.0, 1974.0, 1979.0, 1984.0, 1988.0, 1989....",0.0,4.0,4,0.531208


In [10]:
upload_data(df,"cadaster")

6107
quantilesDwellingArea


AttributeError: 'NoneType' object has no attribute 'tolist'

In [25]:

from sqlalchemy import create_engine, select, Table, MetaData
db_name = 'postgres'
db_user = 'postgres'
db_password = 'D2st3n1t34n21rth$'
db_host = 'localhost'
db_port = '5432' 
engine = create_engine(f'postgresql://{db_user}:{db_password}@{db_host}:{db_port}/{db_name}')

output_table = 'cadaster'

# Write the DataFrame to the SQL database
df.to_sql(output_table, engine, index=False, if_exists='replace')



  df.to_sql(output_table, engine, index=False, if_exists='replace')


AttributeError: 'Engine' object has no attribute 'cursor'

In [2]:
import psycopg2
import yaml
from tqdm import tqdm
import polars as pl
import pandas as pd
with open('/home/eouser/Desktop/DEDL/credentials.yaml', 'r') as f:
    c = yaml.safe_load(f)["postgres"]
    
conn = psycopg2.connect(f"dbname={c['db_name']} user={c['db_user']} password={c['db_password']} host={c['db_host']} port={c['db_port']} sslmode=require")
cursor = conn.cursor()

def adapt_numpy_values(value):
    if isinstance(value, np.generic):
        return value.item()  # Convert scalar to Python native type
    elif isinstance(value, np.ndarray):
        if value.size == 1:
            return value.item()  # Convert single-element array to Python scalar
        else:
            return str(value.tolist())  # Convert multi-element array to string (or handle as needed)
    else:
        return value  # Return original value if not numpy type


dtype_map = {
    'object': 'VARCHAR',
    'int64': 'INTEGER',
    'float64': 'FLOAT',
    'datetime64[ns]': 'TIMESTAMP',
    'datetime64[us]': 'TIMESTAMP'
}

def check_table_exists(table_name):
    query = f"""
        SELECT COUNT(*)
        FROM information_schema.tables
        WHERE table_schema = 'public' AND table_name = '{table_name}'
    """
    cursor.execute(query)
    return cursor.fetchone()[0] 
    

def upload_data(df, table_name):
    if isinstance(df, pl.DataFrame):
        df = df.to_pandas()

            
    if not check_table_exists(table_name):
        columns = []
        for col_name, dtype in df.dtypes.items():
            pg_type = dtype_map.get(str(dtype), 'VARCHAR')
            columns.append(f"{col_name} {pg_type}")
        print(columns)
        create_table_query = f"""
        CREATE TABLE {table_name} (
            {', '.join(columns)},
            PRIMARY KEY (currentUse,postalCode)
        );
        """
        cursor.execute(create_table_query)
        conn.commit()

    insert_query = f"""
    INSERT INTO {table_name} ({','.join(df.columns)})
    VALUES ({','.join(['%s'] * len(df.columns))});
    """
    print(insert_query)
    data = [tuple(adapt_numpy_values(x) for x in row) for row in df.to_numpy()]
    for x in tqdm(data, desc=f"Inserting {table_name} rows", unit="rows"):
        cursor.execute(insert_query,tuple(x))
    conn.commit()

# upload_data(df, "cadaster_data")


# import numpy as np
# data = [tuple(map(lambda x: x if isinstance(x, (np.generic, np.ndarray)) else x, row)) for row in df.to_numpy()]
# if isinstance(df.to_numpy()[0], np.ndarray):
#     print("noo")

In [25]:
import pandas as pd

sql = """
ALTER TABLE unspecified_consumption
DROP PRIMARY KEY;  -- Replace with the correct name

"""

try:
    cursor.execute(sql)
except Exception as e:
    print(f"Error occurred: {e}")
    conn.rollback()  # Rollback the failed transaction
finally:
    cursor.execute(sql)  # Re-run your command to check for the function
    conn.commit()  # Commit if everything goes well
data = cursor.fetchall()
data

Error occurred: syntax error at or near "PRIMARY"
LINE 3: DROP PRIMARY KEY;  -- Replace with the correct name
             ^



SyntaxError: syntax error at or near "PRIMARY"
LINE 3: DROP PRIMARY KEY;  -- Replace with the correct name
             ^
