# In [None]:
from datetime import datetime, timezone
import pandas as pd
import streamlit as st
from dotenv import load_dotenv
from openai import OpenAI

from intersect.embedding import get_embedding
from intersect.utils import add_you, add_index
from intersect.read_pdf import get_text_from_pdf
from intersect.semantic_search import similarity_search
from intersect.cluster_viz import pca_df, get_chart, add_clusters
from intersect.lexical_search import lexical_search
from intersect.rerank import rerank_cohere
from intersect.tfidf import wordcloud_tfidf, tfidf_words
from intersect.ner import wordcloud_ner, ner_count
from intersect.permutation import permutation_openai

# code repurposed from the web app source code

# Load environment variables from a local .env file, if one is present
# (presumably the API credentials needed by the OpenAI / Cohere calls
# further down — confirm).
load_dotenv()


# Path of the text file that is read as the default input text.
DEFAULT_CV_PATH = "intersect/data/cvs/g.txt"
# Number of rows shown by every `.head(table_size)` preview below.
table_size = 5
# NOTE(review): not referenced by the visible code; `generate_chart` takes its
# own `n_clusters` argument, which shadows this value.
n_clusters = 1


def get_current_dbs() -> list[str]:
    """Return the names of the job databases that are currently enabled.

    A fresh list is built on every call, so callers are free to mutate it.
    """
    # "facilitator" is deliberately left out (it is disabled for now).
    enabled = (
        "ai",
        "change",
        "data",
        "fun",
        "law-ai",
        "law",
        "leadership",
    )
    return list(enabled)


def get_db_filepath(db_name: str) -> str:
    """Build the relative path of the feather file that stores `db_name`."""
    feather_path = f"intersect/data/{db_name}.feather"
    return feather_path


def open_and_preprocess_db(_db_name):
    """Load one job database from its feather file and prepare it for ranking.

    Rows containing any missing value are dropped, followed by rows whose
    ``description`` duplicates an earlier row.  The surviving index labels are
    copied into the ``i_relevance`` and ``id`` columns, ``posted`` is parsed
    into a UTC ``timestamp`` column, and ``days_ago`` holds the number of whole
    days between that timestamp and the current time.

    Returns:
        A deep copy of the prepared DataFrame.
    """
    jobs = (
        pd.read_feather(get_db_filepath(_db_name))
        .dropna()
        .drop_duplicates(subset=["description"])
    )

    # Both columns start out as a copy of the frame's index labels.
    for column in ("i_relevance", "id"):
        jobs[column] = jobs.index

    # add days since posted
    jobs["timestamp"] = pd.to_datetime(jobs["posted"], utc=True)
    age = datetime.now(timezone.utc) - jobs["timestamp"]
    jobs["days_ago"] = age.dt.days  # type: ignore
    return jobs.copy(deep=True)


# Work with the first database returned by get_current_dbs().
db_name = get_current_dbs()[0]

# Cleaned job postings; every section below reads and extends this frame.
df = open_and_preprocess_db(db_name)


# NOTE(review): not referenced anywhere else in the visible code.
location = "london"


def get_input_text(path) -> str:
    """Read the file at `path` and return its entire contents as text.

    Args:
        path: Location of the text file to read.

    Returns:
        The full contents of the file as a single string.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Decode as UTF-8 explicitly so the result does not depend on the
    # platform's locale encoding.
    with open(path, "r", encoding="utf-8") as f:
        # Return the text: callers use the result as the search query, so it
        # must not be left in a discarded local.
        return f.read()


# Text used as the query for every search / ranking section below.
input_text = get_input_text(DEFAULT_CV_PATH)

# Count the rows of the module-level `df`; `original_df` is a local variable
# of open_and_preprocess_db() and does not exist at this scope.
print("Jobs found", len(df))

### TF-IDF ###

# Word weights computed over all job descriptions
# (presumably a word -> score mapping, since `.items()` is used below).
wc = tfidf_words(df["description"].tolist())
# The same weights in tabular form, one row per word.
wcdf = pd.DataFrame(list(wc.items()), columns=["Word", "Frequency"])
# Presumably renders the weights as a word cloud — confirm in intersect.tfidf.
wordcloud_tfidf(wc)

### RELEVANCE ###

# Preview the postings in the order they were loaded, restricted to the
# display columns (presumably the source's own relevance order — confirm).
view_relevance = df[["id", "title", "company", "days_ago", "description", "url"]]
view_relevance.head(table_size)

### SEMANTIC ###

# Embed the input text using a default-configured OpenAI client.
input_embedding = get_embedding(OpenAI(), input_text)
# NOTE(review): similarity_search presumably adds a "score_semantic" column
# (add_index is keyed on it below) — confirm in intersect.semantic_search.
df = similarity_search(df, input_embedding)  # type: ignore
# NOTE(review): add_index presumably stores each row's rank by
# "score_semantic" in "i_semantic" — confirm in intersect.utils.
df = add_index(df, "score_semantic", "i_semantic")

# Display columns for the semantic ranking preview.
view_semantic = df[
    ["id", "i_semantic", "title", "company", "days_ago", "description", "url"]
]

view_semantic.head(table_size)

### SEMANTIC DELTA ###

# Difference between each posting's relevance index and its semantic index;
# the largest differences are listed first.
df["delta_semantic"] = df["i_relevance"].sub(df["i_semantic"])
df_semantic_delta = df.sort_values(by="delta_semantic", ascending=False)
view_semantic_delta = df_semantic_delta.loc[
    :,
    [
        "id",
        "i_semantic",
        "delta_semantic",
        "title",
        "company",
        "days_ago",
        "description",
        "url",
    ],
]
view_semantic_delta.head(table_size)

### EMBEDDING PCA ###

# Hand add_you a copy rather than `df` itself; `df` keeps being used by the
# sections that follow.
df_without_you = df.copy()
# NOTE(review): add_you presumably adds a row for the input text (the chart
# code looks for a row titled "Your text") — confirm in intersect.utils.
df_you = add_you(df_without_you, input_text, input_embedding)  # type: ignore
# Reduce the "embedding" column to two components.
df_pca = pca_df(df_you, "embedding", n_components=2)


def generate_chart(_df: pd.DataFrame, n_clusters: int) -> None:
    """Cluster the rows of `_df` and render them as an Altair chart in Streamlit.

    Args:
        _df: Frame to cluster and plot, e.g. the PCA-projected `df_pca`.
            Rows whose ``title`` equals ``"Your text"`` get their ``Cluster``
            value overwritten with ``" You"``.
        n_clusters: Number of clusters forwarded to `add_clusters`.

    Returns:
        None. The chart is emitted through `st.altair_chart`.
    """
    # Cluster the frame that was passed in, not the module-level `df_pca`,
    # so that the `_df` argument is actually honoured.
    clustered = add_clusters(_df, n_clusters, n_components=2)
    clustered.loc[clustered["title"] == "Your text", "Cluster"] = " You"
    chart = get_chart(clustered)
    st.altair_chart(chart, use_container_width=True)


# generate_chart(df_pca, 1)

### LEXICAL ###

# NOTE(review): the view below needs both "score_lexical" and "i_lexical";
# unlike the semantic and reranker sections there is no add_index call here,
# so lexical_search presumably supplies both — confirm in
# intersect.lexical_search.
df = lexical_search(input_text, df)
view_lexical = df.sort_values(by="score_lexical", ascending=False).loc[
    :,
    [
        "id",
        "i_lexical",
        "score_lexical",
        "title",
        "company",
        "days_ago",
        "description",
        "url",
    ],
]
view_lexical.head(table_size)

### RERANKER ###

# Rerank the postings against the input text, then derive the "i_reranker"
# index column from "score_reranker".
df = rerank_cohere(input_text, df)
df = add_index(df, "score_reranker", new_index="i_reranker")
# Highest reranker score first, restricted to the display columns.
view_reranked = df.sort_values(by="score_reranker", ascending=False).loc[
    :,
    [
        "id",
        "i_reranker",
        "score_reranker",
        "title",
        "company",
        "days_ago",
        "description",
        "url",
    ],
]
view_reranked.head(table_size)