In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

In [22]:
df_train = pd.read_csv('train.csv')
df_train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [23]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [24]:
df_test = pd.read_csv('test.csv')
df_test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [25]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        3263 non-null   int64 
 1   keyword   3237 non-null   object
 2   location  2158 non-null   object
 3   text      3263 non-null   object
dtypes: int64(1), object(3)
memory usage: 102.1+ KB


In [26]:
df_train.dropna(inplace=True)
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5080 entries, 31 to 7581
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        5080 non-null   int64 
 1   keyword   5080 non-null   object
 2   location  5080 non-null   object
 3   text      5080 non-null   object
 4   target    5080 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 238.1+ KB


In [33]:
fine_tuned_model = SentenceTransformer('all-MiniLM-L6-v2')

text_embeddings = []
for text in tqdm(df_train['text'].tolist(), total=len(df_train)):
    text_embeddings.append(fine_tuned_model.encode(text, convert_to_tensor=False))


df_train['text_embeddings'] = text_embeddings
print(df_train.info())

100%|██████████| 5080/5080 [01:10<00:00, 72.36it/s]

<class 'pandas.core.frame.DataFrame'>
Index: 5080 entries, 31 to 7581
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   id               5080 non-null   int64 
 1   keyword          5080 non-null   object
 2   location         5080 non-null   object
 3   text             5080 non-null   object
 4   target           5080 non-null   int64 
 5   text_embeddings  5080 non-null   object
dtypes: int64(2), object(4)
memory usage: 277.8+ KB
None





In [34]:
df_train.head()

Unnamed: 0,id,keyword,location,text,target,text_embeddings
31,48,ablaze,Birmingham,@bbcmtd Wholesale Markets ablaze http://t.co/l...,1,"[-0.05192381, -0.061592996, 0.0127951, -0.0352..."
32,49,ablaze,Est. September 2012 - Bristol,We always try to bring the heavy. #metal #RT h...,0,"[-0.07891371, 0.04705478, 0.06574732, 0.037357..."
33,50,ablaze,AFRICA,#AFRICANBAZE: Breaking news:Nigeria flag set a...,1,"[-0.008176893, 0.11879565, 0.004264889, 0.0280..."
34,52,ablaze,"Philadelphia, PA",Crying out for more! Set me ablaze,0,"[-0.02657758, 0.024922693, 0.051392466, -0.035..."
35,53,ablaze,"London, UK",On plus side LOOK AT THE SKY LAST NIGHT IT WAS...,0,"[-0.004596551, 0.023352519, 0.07090306, -0.010..."


In [39]:
import sqlite3

# Connect to the SQLite database
conn = sqlite3.connect('df_train.db')
cursor = conn.cursor()

# Create the table with the specified columns
cursor.execute('''
    CREATE TABLE IF NOT EXISTS embeddings (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        keyword TEXT,
        location TEXT,
        text TEXT,
        target INTEGER,
        text_embeddings BLOB
    )
''')

conn.commit()

# Insert data into the table
for idx, (keyword, location, text, target, text_emb) in enumerate(zip(df_train['keyword'], df_train['location'], df_train['text'], df_train['target'], text_embeddings)):
    text_emb_np = text_emb
    cursor.execute('''
        INSERT INTO embeddings (keyword, location, text, target, text_embeddings)
        VALUES (?, ?, ?, ?, ?)
    ''', (keyword, location, text, target, text_emb_np.tobytes()))

conn.commit()
conn.close()

In [41]:
# Load the embeddings from the vector database
def load_embeddings(db_path):
    conn = sqlite3.connect(db_path)
    
    query = 'SELECT * FROM embeddings'
    result = conn.execute(query).fetchall()
    
    columns = ['id', 'keyword', 'location', 'text', 'target', 'text_embeddings']
    df = pd.DataFrame(result, columns=columns)
    
    df['text_embeddings'] = df['text_embeddings'].apply(lambda x: np.frombuffer(x, dtype=np.float32))
    
    conn.close()
    
    return df

# Usage
db_path = 'df_train.db'
testing_df = load_embeddings(db_path)
testing_df.head()

Unnamed: 0,id,keyword,location,text,target,text_embeddings
0,1,ablaze,Birmingham,@bbcmtd Wholesale Markets ablaze http://t.co/l...,1,"[-0.05192381, -0.061592996, 0.0127951, -0.0352..."
1,2,ablaze,Est. September 2012 - Bristol,We always try to bring the heavy. #metal #RT h...,0,"[-0.07891371, 0.04705478, 0.06574732, 0.037357..."
2,3,ablaze,AFRICA,#AFRICANBAZE: Breaking news:Nigeria flag set a...,1,"[-0.008176893, 0.11879565, 0.004264889, 0.0280..."
3,4,ablaze,"Philadelphia, PA",Crying out for more! Set me ablaze,0,"[-0.02657758, 0.024922693, 0.051392466, -0.035..."
4,5,ablaze,"London, UK",On plus side LOOK AT THE SKY LAST NIGHT IT WAS...,0,"[-0.004596551, 0.023352519, 0.07090306, -0.010..."
