### Labb 2 - SQLAlchemy
# Demodata

Ansluter till vår databas:

In [1]:
from sqlalchemy import create_engine
from sqlalchemy.engine import URL
from urllib.parse import unquote

server_name   = 'localhost'
database_name = 'labb2'

connection_string = f"DRIVER=ODBC Driver 17 for SQL Server;SERVER={server_name};DATABASE={database_name};Trusted_Connection=yes"
url_string        = URL.create("mssql+pyodbc", query={"odbc_connect": connection_string})

print('Connecting to database using URL string:')
unquoted_url = unquote(str(url_string))
print(unquoted_url, '\n')

try:    
    engine = create_engine(url_string)
    with engine.connect() as connection:
        print(f'Successfully connected to {database_name}!')
except Exception as e:
    print('Error while connecting to database:\n')
    print(e)

Connecting to database using URL string:
mssql+pyodbc://?odbc_connect=DRIVER=ODBC+Driver+17+for+SQL+Server;SERVER=localhost;DATABASE=labb2;Trusted_Connection=yes 

Successfully connected to labb2!


Rengör Kaggle-data:
 - (https://www.kaggle.com/datasets/uzair01/amazon-books)
 - Tar bort tomma kolumner
 - Tar bort kolumner jag inte vill använda
 - formaterar isbn13 till ett sätt jag vill ha det
 - Randomiserar fram födelsedagar och review_count då review_count hade lustig formatering
 - sätter *ett* språk då alla var på engelska
 - Skapar index
 - Korrigerar datatyper
 - Ersatte NaN med random data i flera kolumner

In [2]:
import pandas as pd
import numpy as np

df = pd.read_csv('data/Amazon Books Data.csv')

df.drop(columns=['Unnamed: 12',
                'Unnamed: 13',
                'Unnamed: 14',
                'Unnamed: 15',
                'Unnamed: 16',
                'Unnamed: 17',
                'Unnamed: 18',
                'Unnamed: 19',
                'Unnamed: 20',
                'Unnamed: 21',
                'Unnamed: 22',
                'Unnamed: 23',
                'Unnamed: 24',
                'Unnamed: 25',
                'isbn10',
                'description',
                'edition',
                'best_seller',
                'top_rated '
                ], inplace=True)

df = df[df['isbn13'].notna()]

def reformat_isbn(isbn):
    if '-' in isbn:
        parts = isbn.split('-')
        if len(parts[1]) == 10:
            prefix = parts[0]
            remaining = parts[1]
            new_format = f"{prefix}-{remaining[0]}-{remaining[1:6]}-{remaining[6:9]}-{remaining[9:]}"
            return new_format
    return isbn

df['isbn13'] = df['isbn13'].apply(reformat_isbn)
df['author_ID'] = pd.factorize(df['author'])[0]
df['language'] = 'English'

def random_date(start, end):
    start_u = start.value//10**9
    end_u = end.value//10**9

    random_u = np.random.randint(start_u, end_u)
    return pd.to_datetime(random_u, unit='s').date()

start = pd.to_datetime('1940-01-01')
end = pd.to_datetime('1990-12-31')

df['author_birthdate'] = [random_date(start, end).strftime('%Y-%m-%d') for _ in range(len(df))]

df['rating'] = df['rating'].str.replace('$', '').astype(float)
df['review_count'] = np.random.randint(0, 1000, len(df))
df['price'] = df['price'].str.replace('$', '').astype(float)
df['author_birthdate'] = pd.to_datetime(df['author_birthdate']).dt.date
df['publish_date'] = pd.to_datetime(df['publish_date'], errors='coerce').dt.date
df['publish_date'].replace({pd.NaT: pd.to_datetime('1999-09-09')}, inplace=True)
df['publish_date'] = pd.to_datetime(df['publish_date']).dt.date

df.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['publish_date'].replace({pd.NaT: pd.to_datetime('1999-09-09')}, inplace=True)


Unnamed: 0,title,author,isbn13,publish_date,rating,review_count,price,author_ID,language,author_birthdate
0,The Staff Engineer's Path,Tanya Reilly,978-1-09811-873-0,2022-10-25,4.7,345,31.99,0,English,1972-09-02
1,Cracking the Coding Interview,Gayle Laakmann McDowell,978-0-98478-285-7,2015-07-01,4.7,20,33.21,1,English,1962-04-05
2,"Python Crash Course, 3rd Edition",Eric Matthes,978-1-71850-270-3,2023-01-10,4.8,615,30.61,2,English,1980-06-05
3,The Pragmatic Programmer,David Thomas,978-0-13595-705-9,2019-09-13,4.8,774,38.0,3,English,1963-09-13
4,Clean Code,Robert C. Martin,978-0-13235-088-4,2008-08-01,4.7,879,40.0,4,English,1947-10-16


In [3]:
df['isbn13'] = df[df['isbn13'].str.match(r'\d+-\d+-\d+-\d+-\d+', na=False)]['isbn13']
df = df.dropna(subset=['isbn13'])

In [4]:
df.drop_duplicates(subset=['isbn13'], inplace=True)

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 419 entries, 0 to 466
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   title             419 non-null    object 
 1   author            418 non-null    object 
 2   isbn13            419 non-null    object 
 3   publish_date      419 non-null    object 
 4   rating            417 non-null    float64
 5   review_count      419 non-null    int32  
 6   price             418 non-null    float64
 7   author_ID         419 non-null    int64  
 8   language          419 non-null    object 
 9   author_birthdate  419 non-null    object 
dtypes: float64(2), int32(1), int64(1), object(6)
memory usage: 34.4+ KB


Sätter in dataframen i vår sql databas

In [5]:
df.to_sql('RAWbooks', con=engine, if_exists='replace', index=False)

1