## Data flow to MySQL

In [None]:
import mysql.connector as mdb
from mysql.connector import Error
import sys, traceback

In [None]:
def open_database(host, db_name, username, password):
    """Open a MySQL connection and print which server and database it reached.

    Args:
        host: Host name of the MySQL server.
        db_name: Name of the database to select.
        username: MySQL user name.
        password: Password for ``username``.

    Returns:
        The connection object returned by ``mdb.connect``, or ``None`` when
        connecting raised a MySQL ``Error`` (the error is printed, not
        re-raised).
    """
    # Bind the name up front: if connect() raises, the `return` below would
    # otherwise fail with UnboundLocalError and hide the real MySQL error.
    connection = None
    try:
        connection = mdb.connect(host=host,
                                 database=db_name,
                                 user=username,
                                 password=password)
        if connection.is_connected():
            db_Info = connection.get_server_info()
            print("Connected to MySQL database... MySQL Server version on ",db_Info)
            cursor = connection.cursor()
            try:
                cursor.execute("select database();")
                record = cursor.fetchone()
                print ("Your connected to - ", record)
            finally:
                # Release the cursor even when the probe query fails.
                cursor.close()
    except Error as e :
        print ("Error while connecting to MySQL", e)
    return connection

In [None]:
def close_database(connection):
    """Close ``connection`` if it is still connected.

    ``None`` is accepted and ignored, so cleanup code can call this
    unconditionally even when no connection was ever established.

    Args:
        connection: Object exposing ``is_connected()`` and ``close()``,
            or ``None``.
    """
    if connection is None:
        return
    if connection.is_connected():
        connection.close()
        print("MySQL connection is closed")

In [None]:
import getpass
import os

# Connection settings come from the environment (the password is prompted for
# when MYSQL_PASSWORD is unset) instead of being hard-coded in the notebook,
# where the credential would end up in version control.
conn = open_database(
    os.environ.get('MYSQL_HOST', 'localhost'),
    os.environ.get('MYSQL_DATABASE', 'caselaw'),
    os.environ.get('MYSQL_USER', 'root'),
    os.environ.get('MYSQL_PASSWORD') or getpass.getpass('MySQL password: '),
)

In [None]:
# Smoke test: fetch a single row from `case` and print it.
query = """SELECT * FROM caselaw.case LIMIT 1;"""
cursor = conn.cursor(buffered=True)
try:
    cursor.execute(query)
    records = cursor.fetchall()
finally:
    # The original never closed this cursor; release it even if the query fails.
    cursor.close()
for row in records:
    print(row)

---
### Datasets

In [None]:
import pandas as pd
import numpy as np

In [None]:
!ls ../data_dump

In [None]:
def _load_dump(path):
    """Read one CSV dump and replace every NaN cell with the string 'NULL'."""
    return pd.read_csv(path).replace(np.nan, 'NULL', regex=True)


df_case = _load_dump("../data_dump/case.csv")
df_country = _load_dump("../data_dump/countries.csv")
# Legal Intelligence cases are additionally de-duplicated on their ECLI.
df_li_cases = _load_dump("../data_dump/legal_intelligence_cases.csv").drop_duplicates(subset ="ecli")
df_case_opinion = _load_dump("../data_dump/case_opinion_from_advocate_general.csv")
df_case_citation = _load_dump("../data_dump/caselaw_citations.csv")
df_legislation_citation = _load_dump("../data_dump/legislation_citations.csv")

In [None]:
# Replace the underscores in the Legal Intelligence ECLIs with colons so they
# can be matched against the sampled `case_id` values further down.
# Done as one vectorised string operation instead of a per-row
# iterrows()/.at loop; regex=False makes '_' a literal character.
df_li_cases['ecli'] = df_li_cases['ecli'].str.replace('_', ':', regex=False)

---
### Sample to run the procedure

In [None]:
# Upper bound on the number of cases to load.
sample_size = 50000
# List of ECLI numbers chosen for the sample. The size is capped at the number
# of available cases because Series.sample raises ValueError when n exceeds
# the population (sampling without replacement). random_state keeps the
# selection reproducible between runs.
sample = list(
    df_case['case_id'].sample(n=min(sample_size, len(df_case)), random_state=18)
)

In [None]:
#compare the ECLIs of case opinions and cases, because there seems to be no overlap
#case_eclis = df_case['case_id'].values
#opinion_eclis = df_case_opinion['case_id'].values

In [None]:
# Keep only the rows of each dataset whose ECLI is part of the sample.

##0: case sample
in_sample = df_case['case_id'].isin(sample)
df_case = df_case[in_sample]

##1: legislation citations sample
in_sample = df_legislation_citation['source_ecli'].isin(sample)
df_legislation_citation = df_legislation_citation[in_sample]

##2: case citations sample
in_sample = df_case_citation['source_ecli'].isin(sample)
df_case_citation = df_case_citation[in_sample]

##3: case opinions sample
in_sample = df_case_opinion['case_id'].isin(sample)
df_case_opinion = df_case_opinion[in_sample]

##4: li cases sample
in_sample = df_li_cases['ecli'].isin(sample)
df_li_cases = df_li_cases[in_sample]

# Row counts after filtering, in the order 0-4 above.
remaining = [len(frame) for frame in (df_case,
                                      df_legislation_citation,
                                      df_case_citation,
                                      df_case_opinion,
                                      df_li_cases)]
print(*remaining)

---
### Utils

In [None]:
def to_tuples(df):
    """Return the rows of ``df`` as a list of plain tuples, in row order.

    Rows are produced with ``itertuples`` instead of ``df.values`` so that each
    column keeps its own dtype (no upcasting across columns) and numeric cells
    come back as built-in ``int``/``float`` rather than NumPy scalars, which a
    database driver may refuse to bind as query parameters.

    Args:
        df: pandas DataFrame to convert.

    Returns:
        A list with one tuple per row; empty when ``df`` has no rows.
    """
    return list(df.itertuples(index=False, name=None))

In [None]:
def clean_table_sql(table_name):
    """Delete every row of ``table_name`` and reset its AUTO_INCREMENT to 1.

    Uses the module-level ``conn``. On a MySQL ``Error`` the transaction is
    rolled back and the error plus traceback are printed; nothing is raised.

    Args:
        table_name: Name of the table to empty. It is interpolated into the
            statements as a back-quoted identifier (identifiers cannot be
            bound as query parameters), so it must come from trusted code.
    """
    cursor = None
    try:
        cursor = conn.cursor(buffered=True)
        cursor.execute("""DELETE FROM `{}`;""".format(table_name))
        # NOTE(review): MySQL documents ALTER TABLE as causing an implicit
        # commit, so the rollback below presumably cannot undo the DELETE once
        # this statement has run - confirm.
        cursor.execute("""ALTER TABLE `{}` AUTO_INCREMENT = 1;""".format(table_name))
        conn.commit()
    except Error as error :
        conn.rollback()
        print("Failed to delete MySQL table {}".format(error))
        traceback.print_exc(file=sys.stdout)
    finally:
        # The original opened two cursors and closed neither.
        if cursor is not None:
            cursor.close()

In [None]:
def get_parent_ids(table, column_table, df, column_df):
    """Look up the ``id`` in ``table`` for every value of ``df[column_df]``.

    Uses the module-level ``conn``. Values are bound as query parameters, so
    quotes inside a value need no special handling. Each distinct value is
    queried only once; repeated values reuse the id found the first time.

    Args:
        table: DB table to query (trusted identifier, interpolated as-is).
        column_table: Column of ``table`` compared against each value
            (trusted identifier, interpolated as-is).
        df: pandas DataFrame holding the values to look up.
        column_df: Column of ``df`` whose values are looked up.

    Returns:
        A list of ids with one entry per row of ``df``, in row order. When
        several DB rows match a value, the id of the first returned row is
        used.

    Raises:
        IndexError: If a value has no matching row in ``table``.
    """
    query = "SELECT id FROM `{}` WHERE {} = %s".format(table, column_table)
    ids_by_value = {}
    pid = []
    cursor = conn.cursor()
    try:
        for value in df[column_df]:
            if value not in ids_by_value:
                cursor.execute(query, (value,))
                records = cursor.fetchall()
                if not records:
                    # The original printed 'no records' and then failed on
                    # records[0] with a bare IndexError; keep the exception
                    # type but say which lookup failed.
                    raise IndexError(
                        "no records in `{}` where {} = {!r}".format(
                            table, column_table, value))
                ids_by_value[value] = records[0][0]
            pid.append(ids_by_value[value])
    finally:
        # Close the cursor on the error path too.
        cursor.close()
    return pid

---
### Table hierarchy

1. Court
2. Case
3. Case Opinion
4. LI Case
5. Subject
6. Case subject
7. Country
8. Case country
9. Case citation
10. Legislation citation
11. Case related decision


In [None]:
# Empty the tables in the reverse order of the hierarchy listed above,
# i.e. starting with `case_related_decision` and ending with `court`.
for table_to_clean in (
    'case_related_decision',
    'legislation_citation',
    'case_citation',
    'case_country',
    'country',
    'case_subject',
    'subject',
    'legal_intelligence_case',
    'case_opinion',
    'case',
    'court',
):
    clean_table_sql(table_to_clean)

---
### Courts

In [None]:
court = pd.DataFrame()

In [None]:
courts_list = df_case.authority.unique()
court['name'] = courts_list

In [None]:
# Dummy values: every court attribute other than the name is filled with the
# string 'NULL'.
for dummy_column in ('type',
                     'level',
                     'country',
                     'language',
                     'jurisdiction',
                     'law_area',
                     'authority_level'):
    court.loc[:, dummy_column] = 'NULL'

In [None]:
court.head(2)

In [None]:
tuples = to_tuples(court)

In [None]:
# Insert the courts in one transaction. The values are bound as query
# parameters; the original spliced them into the statement text with the %
# operator, which breaks on (and allows injection through) court names that
# contain a double quote.
query = ("INSERT INTO `court`"
        "(name, type, level, country, language, jurisdiction, law_area, authority_level)"
        "VALUES (%s, %s, %s, %s, %s, %s, %s, %s)")
cursor = None
try:
    cursor = conn.cursor()
    for name, *attributes in tuples:
        # The original emitted the name quoted (always a string) and the
        # remaining 'NULL' placeholders unquoted (SQL NULL). Binding None for
        # those placeholders keeps the stored values the same.
        params = (name, *(None if value == 'NULL' else value for value in attributes))
        cursor.execute(query, params)
    conn.commit()
    print('court added')
except Error as error :
    conn.rollback()
    print("Failed to insert into MySQL table {}".format(error))
    traceback.print_exc(file=sys.stdout)
finally:
    # Close the cursor on the error path too.
    if cursor is not None:
        cursor.close()

In [None]:
len(court)

---
### Case

In [None]:
# Column names of the `case` frame, in the order used by the INSERT below.
case_columns = [
    'date', 'description', 'language', 'venue', 'abstract',
    'procedure_type', 'lodge_date', 'link', 'ecli',
]
# Empty frame that the next cell fills.
case = pd.DataFrame()

In [None]:
# Select the source columns and rename them positionally to `case_columns`
# ('alternative_sources' becomes 'link', 'case_id' becomes 'ecli').
temp = df_case[['date',
                'description',
                'language',
                'venue',
                'abstract',
                'procedure_type',
                'lodge_date',
                'alternative_sources',
                'case_id']]
temp.columns = case_columns
# DataFrame.append was removed in pandas 2.0. Appending to the empty `case`
# frame with ignore_index=True only renumbered the rows, which reset_index
# does on every pandas version.
case = temp.reset_index(drop=True)

In [None]:
parents_ids = get_parent_ids('court', 'name', df_case, 'authority')

In [None]:
def _parse_case_date(raw):
    """Parse one date cell; 'NULL' maps to 1900-01-01, unparsable input to NaT."""
    if raw != 'NULL':
        return pd.to_datetime(raw, errors='coerce')
    return pd.to_datetime('1900-01-01 00:00:00')


case['name'] = 'NULL'
case['court_id'] = parents_ids
for date_column in ('date', 'lodge_date'):
    case[date_column] = [_parse_case_date(raw) for raw in case[date_column]]

In [None]:
case.head()

In [None]:
tuples = to_tuples(case)

In [None]:
# Insert the prepared case rows in one transaction; a MySQL error rolls the
# batch back and is reported on stdout together with the traceback.
query = ("INSERT INTO `case`"
        "(date, description, language, venue, abstract, procedure_type, lodge_date, link, ecli, name, court_id)"
        "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
try:
    cursor = conn.cursor(buffered=True)
    for row in tuples:
        cursor.execute(query, row)
    conn.commit()
    cursor.close()
    print('case added')
except Error as db_error:
    conn.rollback()
    print("Failed to insert into MySQL table {}".format(db_error))
    traceback.print_exc(file=sys.stdout)

---
### Case opinion advocate general

In [None]:
# Column names of the `case_opinion` frame, in the order used by the INSERT below.
case_opinion_columns = [
    'date', 'case_number', 'description', 'language', 'country',
    'venue', 'abstract', 'procedure_type', 'authority', 'ecli',
]
# Empty frame that the next cell fills.
case_opinion = pd.DataFrame()

In [None]:
# Select the source columns and rename them positionally to
# `case_opinion_columns` ('case_id' becomes 'ecli').
temp = df_case_opinion[['date',
                        'case_number',
                        'description',
                        'language',
                        'country',
                        'venue',
                        'abstract',
                        'procedure_type',
                        'authority',
                        'case_id']]
temp.columns = case_opinion_columns
# DataFrame.append was removed in pandas 2.0. Appending to the empty
# `case_opinion` frame with ignore_index=True only renumbered the rows,
# which reset_index does on every pandas version.
case_opinion = temp.reset_index(drop=True)

In [None]:
# Parse the opinion dates: 'NULL' becomes 1900-01-01, anything unparsable
# is coerced to NaT.
parsed_dates = []
for raw_date in case_opinion['date']:
    if raw_date != 'NULL':
        parsed_dates.append(pd.to_datetime(raw_date, errors='coerce'))
    else:
        parsed_dates.append(pd.to_datetime('1900-01-01 00:00:00'))
case_opinion['date'] = parsed_dates

In [None]:
case_opinion.head(2)

In [None]:
tuples = to_tuples(case_opinion)

In [None]:
# Insert the prepared case-opinion rows in one transaction; a MySQL error
# rolls the batch back and is reported on stdout together with the traceback.
query = ("INSERT INTO `case_opinion`"
        "(date, case_number, description, language, country, venue, abstract, procedure_type, authority, ecli)"
        "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
try:
    cursor = conn.cursor()
    for row in tuples:
        cursor.execute(query, row)
    conn.commit()
    cursor.close()
    print('case_opinion added')
except Error as db_error:
    conn.rollback()
    print("Failed to insert into MySQL table {}".format(db_error))
    traceback.print_exc(file=sys.stdout)

---
### Legal Intelligence Cases

In [None]:
# Column names of the `legal_intelligence_case` frame, in the order used by
# the INSERT below. The spellings are the ones the INSERT statement uses.
legal_intelligence_case_columns = [
    'ecli', 'name', 'date', 'abstract', 'subject', 'link',
    'DisplayTitle', 'OriginalUrl', 'Jurisidiction', 'DocumentType',
    'CaseNumber', 'PublicationNumber', 'IssueNumber', 'lodge_date',
    'DateAdded', 'Sources', 'UrlWithAutoLogOnToken', 'court',
    'DisplaySubtitle',
]
# Empty frame that the next cell fills.
legal_intelligence_case = pd.DataFrame()

In [None]:
# Select the source columns and rename them positionally to
# `legal_intelligence_case_columns` (e.g. 'Title' becomes 'name',
# 'LawArea' becomes 'subject', 'authority' becomes 'court').
temp = df_li_cases[['ecli',
                    'Title',
                    'date',
                    'abstract',
                    'LawArea',
                    'Url',
                    'DisplayTitle',
                    'OriginalUrl',
                    'Jurisdiction',
                    'DocumentType',
                    'case_number',
                    'PublicationNumber',
                    'IssueNumber',
                    'lodge_date',
                    'DateAdded',
                    'Sources',
                    'UrlWithAutoLogOnToken',
                    'authority',
                    'DisplaySubtitle']]
temp.columns = legal_intelligence_case_columns
# DataFrame.append was removed in pandas 2.0. Appending to the empty
# `legal_intelligence_case` frame with ignore_index=True only renumbered the
# rows, which reset_index does on every pandas version.
legal_intelligence_case = temp.reset_index(drop=True)

In [None]:
legal_intelligence_case.head(2)

In [None]:
#legal_intelligence_case['name'] = [i[0:250] for i in legal_intelligence_case['name']]
#legal_intelligence_case['DisplayTitle'] = [i[0:250] for i in legal_intelligence_case['DisplayTitle']]
# Parse the three date columns from the source frame `df_li_cases`:
# 'NULL' becomes 1900-01-01, anything unparsable is coerced to NaT.
for date_column in ('date', 'lodge_date', 'DateAdded'):
    legal_intelligence_case[date_column] = [
        pd.to_datetime(raw, errors='coerce') if raw != 'NULL' else pd.to_datetime('1900-01-01 00:00:00')
        for raw in df_li_cases[date_column]
    ]

In [None]:
tuples = to_tuples(legal_intelligence_case)

In [None]:
# Insert the prepared Legal Intelligence rows in one transaction; a MySQL
# error rolls the batch back and is reported on stdout with the traceback.
query = ("INSERT INTO `legal_intelligence_case`"
        "(ecli, name, date, abstract, subject, link, DisplayTitle, OriginalUrl, Jurisidiction, DocumentType, CaseNumber, PublicationNumber, IssueNumber, lodge_date, DateAdded, Sources, UrlWithAutoLogOnToken, court, DisplaySubtitle)"
        "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
try:
    cursor = conn.cursor()
    for row in tuples:
        cursor.execute(query, row)
    conn.commit()
    cursor.close()
    print('legal intelligence added')
except Error as db_error:
    conn.rollback()
    print("Failed to insert into MySQL table {}".format(db_error))
    traceback.print_exc(file=sys.stdout)

---
### Subjects

In [None]:
# Each case stores its subjects as one "; "-separated string; split it into
# one list of subject names per case.
subjects_as_list = [list(row.split("; ")) for row in df_case.subject]
# Distinct subject names over all cases. The following cell reads this set as
# `unique_subjects`; the original bound it to `que_subjects` only, which made
# that cell fail with NameError. A set comprehension also replaces the private
# helper pd.core.common.flatten.
unique_subjects = {name for row in subjects_as_list for name in row}
# Original name kept as an alias in case anything still refers to it.
que_subjects = unique_subjects

In [None]:
# One row per distinct subject name, sorted; `standard_name` is filled with
# the string 'NULL'.
subject = pd.DataFrame()
subject['name'] = sorted(unique_subjects)
subject['standard_name'] = 'NULL'

In [None]:
subject.head(2)

In [None]:
#clean_table_sql('subject')

In [None]:
tuples = to_tuples(subject)

In [None]:
# Insert the prepared subject rows in one transaction; a MySQL error rolls
# the batch back and is reported on stdout together with the traceback.
query = ("INSERT INTO `subject`"
        "(name, standard_name)"
        "VALUES (%s, %s)")
try:
    cursor = conn.cursor()
    for row in tuples:
        cursor.execute(query, row)
    conn.commit()
    cursor.close()
    print('subject added')
except Error as db_error:
    conn.rollback()
    print("Failed to insert into MySQL table {}".format(db_error))
    traceback.print_exc(file=sys.stdout)

---
### Case - Subject

In [None]:
# Work on an explicit copy: the original assigned through .loc on a column
# selection of df_case, which pandas flags with SettingWithCopyWarning.
df_subjects_case = df_case[['subject','case_id']].copy()
# Plain column assignment stores one list of subject names per row.
df_subjects_case['subject'] = subjects_as_list
# One row per (case, subject) pair.
df_subjects_case = df_subjects_case.explode('subject')
len(df_subjects_case)

In [None]:
# Subject id for every (case, subject) row, in row order.
parents_ids_subjects = get_parent_ids('subject', 'name', df_subjects_case, 'subject')
# The original line was missing its closing parenthesis (SyntaxError).
len(parents_ids_subjects)

In [None]:
parents_ids_cases = get_parent_ids('case', 'ecli', df_subjects_case, 'case_id')
len(parents_ids_cases)

In [None]:
case_subject = pd.DataFrame({'case_id':parents_ids_cases,
                             'subject_id':parents_ids_subjects})

In [None]:
#case_subject.sort_values(by='case_id').tail(50) #quality check
#[(type(i), type(j)) for i,j in zip(case_subject.subject_id,case_subject.case_id)]

In [None]:
case_subject.tail(3)

In [None]:
# Rows as (case_id, subject_id) pairs, with both ids coerced to built-in int.
tuples = [(int(case_pk), int(subject_pk))
          for case_pk, subject_pk in to_tuples(case_subject)]

In [None]:
# Insert the (case_id, subject_id) pairs in one transaction; a MySQL error
# rolls the batch back and is reported on stdout together with the traceback.
query = ("INSERT INTO `case_subject`"
        "(case_id, subject_id)"
        "VALUES (%s, %s)")
try:
    cursor = conn.cursor()
    for row in tuples:
        cursor.execute(query, row)
    conn.commit()
    cursor.close()
    print('case_subject added')
except Error as db_error:
    conn.rollback()
    print("Failed to insert into MySQL table {}".format(db_error))
    traceback.print_exc(file=sys.stdout)

---
### Countries

In [None]:
df_country.head()

In [None]:
df_country.loc[:,'language'] = 'NULL'
df_country.loc[:,'eea'] = 0

In [None]:
# Column names of the `country` frame, in the order used by the INSERT below.
country_columns = ['id', 'name', 'language', 'flag', 'eu', 'eea']
# Empty frame that the next cell fills.
country = pd.DataFrame()

In [None]:
# Select the source columns and rename them positionally to `country_columns`
# ('country_id' becomes 'id').
temp = df_country[['country_id',
                   'name',
                   'language',
                   'flag',
                   'eu',
                   'eea']]
temp.columns = country_columns
# DataFrame.append was removed in pandas 2.0. Appending to the empty `country`
# frame with ignore_index=True only renumbered the rows, which reset_index
# does on every pandas version.
country = temp.reset_index(drop=True)

In [None]:
country.head(3)

In [None]:
tuples = to_tuples(country)

In [None]:
# Insert the prepared country rows in one transaction; a MySQL error rolls
# the batch back and is reported on stdout together with the traceback.
query = ("INSERT INTO `country`"
        "(id, name, language, flag, eu, eea)"
        "VALUES (%s, %s, %s, %s, %s, %s)")
try:
    cursor = conn.cursor()
    for row in tuples:
        cursor.execute(query, row)
    conn.commit()
    cursor.close()
    print('country added')
except Error as db_error:
    conn.rollback()
    print("Failed to insert into MySQL table {}".format(db_error))
    traceback.print_exc(file=sys.stdout)

---
### Case - Country

In [None]:
df_case.head(2)

In [None]:
#as_list = [list(row.split("; ")) for row in df_case.country] #in case there is more than one country in the row

In [None]:
# (case_id, country) pairs; explode() yields one row per list element when a
# cell holds a list (see the commented-out split in this section).
#df_country_case.loc[:,'country'] = as_list
df_country_case = df_case[['case_id','country']].explode('country')
len(df_country_case)

In [None]:
parents_ids_countries = get_parent_ids('country', 'id', df_country_case, 'country')
len(parents_ids_countries)

In [None]:
parents_ids_cases = get_parent_ids('case', 'ecli', df_country_case, 'case_id')
len(parents_ids_cases)

In [None]:
case_country = pd.DataFrame({'case_id':parents_ids_cases,
                             'country_id':parents_ids_countries})

In [None]:
case_country.tail(3)

In [None]:
tuples = to_tuples(case_country)
#tuples = [(int(i), int(j)) for i,j in tuples]

In [None]:
# Insert the (case_id, country_id) pairs in one transaction; a MySQL error
# rolls the batch back and is reported on stdout together with the traceback.
query = ("INSERT INTO `case_country`"
        "(case_id, country_id)"
        "VALUES (%s, %s)")
cursor = None
try:
    cursor = conn.cursor()
    for row in tuples:
        cursor.execute(query, row)
    conn.commit()
    # The original printed 'case_subject added' here (copy-paste slip).
    print('case_country added')
except Error as error :
    conn.rollback()
    print("Failed to insert into MySQL table {}".format(error))
    traceback.print_exc(file=sys.stdout)
finally:
    # Close the cursor on the error path too.
    if cursor is not None:
        cursor.close()

---
### Case law citation

In [None]:
df_case_citation.head(4)

In [None]:
parent_ids = get_parent_ids('case', 'ecli', df_case_citation, 'source_ecli')
df_case_citation['case_id'] = parent_ids

In [None]:
# Column names of the `case_citation` frame, in the order used by the INSERT below.
case_citation_columns = [
    'source_ecli', 'source_paragraph',
    'target_ecli', 'target_paragraph',
    'case_id',
]
# Empty frame that the next cell fills.
case_citation = pd.DataFrame()

In [None]:
# Select the columns listed in `case_citation_columns` (same names, same order).
temp = df_case_citation[['source_ecli',
                         'source_paragraph',
                         'target_ecli',
                         'target_paragraph',
                         'case_id']]
temp.columns = case_citation_columns
# DataFrame.append was removed in pandas 2.0. Appending to the empty
# `case_citation` frame with ignore_index=True only renumbered the rows,
# which reset_index does on every pandas version.
case_citation = temp.reset_index(drop=True)

In [None]:
case_citation.head(3)

In [None]:
tuples = to_tuples(case_citation)

In [None]:
# Insert the prepared case-citation rows in one transaction; a MySQL error
# rolls the batch back and is reported on stdout together with the traceback.
query = ("INSERT INTO `case_citation`"
        "(source_ecli, source_paragraph, target_ecli, target_paragraph, case_id)"
        "VALUES (%s, %s, %s, %s, %s)")
try:
    cursor = conn.cursor()
    for row in tuples:
        cursor.execute(query, row)
    conn.commit()
    cursor.close()
    print('case_citation added')
except Error as db_error:
    conn.rollback()
    print("Failed to insert into MySQL table {}".format(db_error))
    traceback.print_exc(file=sys.stdout)

---
### Legislation citation

In [None]:
df_legislation_citation.head(4)

In [None]:
parent_ids = get_parent_ids('case', 'ecli', df_legislation_citation, 'source_ecli')
df_legislation_citation['case_id'] = parent_ids

In [None]:
df_legislation_citation.loc[:,'target_name'] = 'NULL'
df_legislation_citation.loc[:,'target_sourcename'] = 'NULL'

In [None]:
legislation_citation = pd.DataFrame()
# Column names of the `legislation_citation` frame, in the order used by the
# INSERT below. The original had no comma after 'target_link', so Python
# concatenated it with 'case_id' into the single name 'target_linkcase_id'
# and the list had 7 entries instead of 8.
legislation_citation_columns = ['source_ecli',
                                'source_paragraph',
                                'target_article',
                                'target_paragraph',
                                'target_name',
                                'target_sourcename',
                                'target_link',
                                'case_id']

In [None]:
# The original had no comma after 'target_article_webpage', so Python
# concatenated it with 'case_id' into one non-existent column name and the
# selection failed with KeyError.
temp = df_legislation_citation[['source_ecli',
                                'source_paragraph',
                                'target_article',
                                'target_article_paragraph',
                                'target_name',
                                'target_sourcename',
                                'target_article_webpage',
                                'case_id']]
# Rename by explicit mapping to the names used in the INSERT below, so the
# result does not depend on the length or order of a separate column list.
temp = temp.rename(columns={'target_article_paragraph': 'target_paragraph',
                            'target_article_webpage': 'target_link'})
# DataFrame.append was removed in pandas 2.0. Appending to the empty
# `legislation_citation` frame with ignore_index=True only renumbered the
# rows, which reset_index does on every pandas version.
legislation_citation = temp.reset_index(drop=True)

In [None]:
legislation_citation.head(3)

In [None]:
tuples = to_tuples(legislation_citation)

In [None]:
# Insert the prepared legislation-citation rows in one transaction; a MySQL
# error rolls the batch back and is reported on stdout with the traceback.
query = ("INSERT INTO `legislation_citation`"
        "(source_ecli, source_paragraph, target_article, target_paragraph, target_name, target_sourcename, target_link, case_id)"
        "VALUES (%s, %s, %s, %s, %s, %s, %s, %s)")
try:
    cursor = conn.cursor()
    for row in tuples:
        cursor.execute(query, row)
    conn.commit()
    cursor.close()
    print('legislation_citation added')
except Error as db_error:
    conn.rollback()
    print("Failed to insert into MySQL table {}".format(db_error))
    traceback.print_exc(file=sys.stdout)

---
### Case related decision

In [None]:
df_case_related = df_case[['case_id','related_cases']]\
    .rename(columns = {'case_id':'source_ecli', 
                       'related_cases': 'referencing_case_ecli'})

In [None]:
parent_ids = get_parent_ids('case', 'ecli', df_case_related, 'source_ecli')
df_case_related['case_id'] = parent_ids

In [None]:
df_case_related.loc[:,'referenced_case_ecli'] = 'NULL'

In [None]:
df_case_related.head(2)

In [None]:
# Column names of the `case_related_decision` frame, in the order used by the
# INSERT below.
case_related_decision_columns = [
    'source_ecli',
    'referencing_case_ecli',
    'referenced_case_ecli',
    'case_id',
]
# Empty frame that the next cell fills.
case_related_decision = pd.DataFrame()

In [None]:
# Select the columns listed in `case_related_decision_columns` (same names,
# same order).
temp = df_case_related[['source_ecli',
                                'referencing_case_ecli',
                                'referenced_case_ecli',
                                'case_id']]
temp.columns = case_related_decision_columns
# DataFrame.append was removed in pandas 2.0. Appending to the empty
# `case_related_decision` frame with ignore_index=True only renumbered the
# rows, which reset_index does on every pandas version.
case_related_decision = temp.reset_index(drop=True)

In [None]:
case_related_decision.head(3)

In [None]:
tuples = to_tuples(case_related_decision)

In [None]:
# Insert the prepared related-decision rows in one transaction; a MySQL error
# rolls the batch back and is reported on stdout together with the traceback.
query = ("INSERT INTO `case_related_decision`"
        "(source_ecli, referencing_case_ecli, referenced_case_ecli, case_id)"
        "VALUES (%s, %s, %s, %s)")
cursor = None
try:
    cursor = conn.cursor()
    for row in tuples:
        cursor.execute(query, row)
    conn.commit()
    # The original printed 'legislation_citation added' here (copy-paste slip).
    print('case_related_decision added')
except Error as error :
    conn.rollback()
    print("Failed to insert into MySQL table {}".format(error))
    traceback.print_exc(file=sys.stdout)
finally:
    # Close the cursor on the error path too.
    if cursor is not None:
        cursor.close()