In [13]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Load the comments CSV; the path is relative to the notebook's working directory.
df = pd.read_csv('../data/public_comments.csv')

In [4]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,id,type,attributes_document_type,attributes_last_modified_date,attributes_highlighted_content,attributes_withdrawn,attributes_agency_id,attributes_title,attributes_object_id,attributes_posted_date,links_self,comment
0,VA-2020-VHA-0024-0004,comments,Public Submission,2020-11-12T19:26:52Z,,False,VA,"Comment on AQ94-Interim Final Rule-Batley, Tyler",090000648495a6f1,2020-11-12T05:00:00Z,https://api.regulations.gov/v4/comments/VA-202...,&quot; Directive 1899 memorializes U.S. Depart...
1,VA-2020-VHA-0024-0005,comments,Public Submission,2020-11-12T19:34:12Z,,False,VA,Comment on AQ94-Interim Final Rule-Anonymous,090000648495accf,2020-11-12T05:00:00Z,https://api.regulations.gov/v4/comments/VA-202...,"As a family physician, I oppose autonomous pra..."
2,VA-2020-VHA-0024-0006,comments,Public Submission,2020-11-13T13:28:22Z,,False,VA,Comment on AQ94-Interim Final Rule-Comment Sub...,090000648495a87a,2020-11-13T05:00:00Z,https://api.regulations.gov/v4/comments/VA-202...,Our nations Veterans have served and sacrifice...
3,VA-2020-VHA-0024-0007,comments,Public Submission,2020-11-13T13:28:22Z,,False,VA,Comment on AQ94-Interim Final Rule-Comment Sub...,090000648495b402,2020-11-13T05:00:00Z,https://api.regulations.gov/v4/comments/VA-202...,This proposed regulation is a danger to health...
4,VA-2020-VHA-0024-0008,comments,Public Submission,2020-11-13T13:28:23Z,,False,VA,Comment on AQ94-Interim Final Rule-Comment Sub...,090000648495b5f8,2020-11-13T05:00:00Z,https://api.regulations.gov/v4/comments/VA-202...,This seems like just common sense. <br/>The VA...


In [16]:
def nlp_clean(str_to_clean: str) -> str:
    """
    Clean a raw comment string for use with NLP services.

    The cleaning steps run in this order:
      1. every HTML/XML tag (``<...>``) is replaced by a single space
      2. the ``&quot;`` and ``&#39;`` entities are removed
      3. hyphens are replaced by spaces
      4. every character that is not an ASCII letter, a digit, a space or
         one of ``. ! ,`` is removed

    Args:
        str_to_clean: raw comment text. Missing values (``None`` or a float
            ``NaN``, which pandas uses for empty cells) are accepted and
            cleaned to an empty string; any other non-string value is
            converted with ``str()`` before cleaning.

    Returns:
        The cleaned, ASCII-only string.
    """
    # Missing cells must not crash a Series.apply over the comment column.
    if str_to_clean is None:
        return ""
    if isinstance(str_to_clean, float) and str_to_clean != str_to_clean:
        # NaN is the only float that is not equal to itself.
        return ""
    if not isinstance(str_to_clean, str):
        str_to_clean = str(str_to_clean)

    # Tags become a space so the words on either side do not get glued together.
    clean_str = re.sub(r"<[^>]+>", " ", str_to_clean)
    # Entities are removed one after the other (not in a single alternation)
    # so the result matches sequential substitution exactly.
    clean_str = re.sub(r"&quot;", "", clean_str)
    clean_str = re.sub(r"&#39;", "", clean_str)
    # Hyphenated words are split into their parts.
    clean_str = re.sub(r"-", " ", clean_str)
    # Whitelist filter: only ASCII letters, digits, spaces and . ! , survive,
    # so the result is already pure ASCII and needs no extra encode/decode pass.
    clean_str = re.sub(r"[^a-zA-Z0-9 \.!,]", "", clean_str)

    return clean_str


def get_cosine_sim(df_full):
    """
    Build the pairwise cosine-similarity matrix of the cleaned comments.

    A ``TfidfVectorizer`` with default settings is fitted on the
    ``clean_comment`` column of ``df_full``; the resulting TF-IDF matrix is
    then compared against itself with ``cosine_similarity``.

    Args:
        df_full: frame holding a ``clean_comment`` text column.

    Returns:
        The result of ``cosine_similarity`` on the TF-IDF matrix against
        itself, i.e. entry ``[i, j]`` scores comment ``i`` against comment ``j``.
    """
    comment_weights = TfidfVectorizer().fit_transform(df_full["clean_comment"])
    return cosine_similarity(comment_weights, comment_weights)



def get_similar_comments(
    index: int, df: pd.DataFrame, cosine_sim, threshold: float
) -> pd.DataFrame:
    """
    Return the comments that are similar to the comment at row ``index``.

    Args:
        index: row of ``cosine_sim`` to read, i.e. the position of the
            reference comment in the similarity matrix.
        df: frame of comments; it must have as many rows as
            ``cosine_sim[index]`` has entries. It is NOT modified.
        cosine_sim: similarity matrix whose rows support ``.tolist()``
            (e.g. a NumPy array).
        threshold: only rows with a score strictly greater than this value
            are kept.

    Returns:
        A new DataFrame containing the rows of ``df`` whose score exceeds
        ``threshold``, with the scores in an added ``Similar_Score`` column.
    """
    # assign() works on a copy, so the caller's frame does not silently gain
    # (or have overwritten) a Similar_Score column on every call.
    scored = df.assign(Similar_Score=cosine_sim[index].tolist())
    return scored.loc[scored["Similar_Score"] > threshold]

In [8]:
# Run every raw comment through nlp_clean and keep the result in its own column.
df["clean_comment"] = df["comment"].map(nlp_clean)

In [9]:
# Display the frame now that it carries the clean_comment column.
df

Unnamed: 0,id,type,attributes_document_type,attributes_last_modified_date,attributes_highlighted_content,attributes_withdrawn,attributes_agency_id,attributes_title,attributes_object_id,attributes_posted_date,links_self,comment,clean_comment
0,VA-2020-VHA-0024-0004,comments,Public Submission,2020-11-12T19:26:52Z,,False,VA,"Comment on AQ94-Interim Final Rule-Batley, Tyler",090000648495a6f1,2020-11-12T05:00:00Z,https://api.regulations.gov/v4/comments/VA-202...,&quot; Directive 1899 memorializes U.S. Depart...,Directive 1899 memorializes U.S. Department o...
1,VA-2020-VHA-0024-0005,comments,Public Submission,2020-11-12T19:34:12Z,,False,VA,Comment on AQ94-Interim Final Rule-Anonymous,090000648495accf,2020-11-12T05:00:00Z,https://api.regulations.gov/v4/comments/VA-202...,"As a family physician, I oppose autonomous pra...","As a family physician, I oppose autonomous pra..."
2,VA-2020-VHA-0024-0006,comments,Public Submission,2020-11-13T13:28:22Z,,False,VA,Comment on AQ94-Interim Final Rule-Comment Sub...,090000648495a87a,2020-11-13T05:00:00Z,https://api.regulations.gov/v4/comments/VA-202...,Our nations Veterans have served and sacrifice...,Our nations Veterans have served and sacrifice...
3,VA-2020-VHA-0024-0007,comments,Public Submission,2020-11-13T13:28:22Z,,False,VA,Comment on AQ94-Interim Final Rule-Comment Sub...,090000648495b402,2020-11-13T05:00:00Z,https://api.regulations.gov/v4/comments/VA-202...,This proposed regulation is a danger to health...,This proposed regulation is a danger to health...
4,VA-2020-VHA-0024-0008,comments,Public Submission,2020-11-13T13:28:23Z,,False,VA,Comment on AQ94-Interim Final Rule-Comment Sub...,090000648495b5f8,2020-11-13T05:00:00Z,https://api.regulations.gov/v4/comments/VA-202...,This seems like just common sense. <br/>The VA...,This seems like just common sense. The VA sho...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
13272,VA-2020-VHA-0024-13329,comments,Public Submission,2021-01-12T04:56:00Z,,False,VA,Comment on AQ94-Interim Final Rule-Comment Sub...,09000064849eee1a,2021-01-11T05:00:00Z,https://api.regulations.gov/v4/comments/VA-202...,"<br/>Veterans fight for our country, we should...","Veterans fight for our country, we should fig..."
13273,VA-2020-VHA-0024-13330,comments,Public Submission,2021-01-12T04:56:01Z,,False,VA,Comment on AQ94-Interim Final Rule-Comment Sub...,09000064849eee1b,2021-01-11T05:00:00Z,https://api.regulations.gov/v4/comments/VA-202...,"As a concerned citizen, Im writing today in su...","As a concerned citizen, Im writing today in su..."
13274,VA-2020-VHA-0024-13330,comments,Public Submission,2021-01-12T04:56:01Z,,False,VA,Comment on AQ94-Interim Final Rule-Comment Sub...,09000064849eee1b,2021-01-11T05:00:00Z,https://api.regulations.gov/v4/comments/VA-202...,"As a concerned citizen, Im writing today in su...","As a concerned citizen, Im writing today in su..."
13275,VA-2020-VHA-0024-13330,comments,Public Submission,2021-01-12T04:56:01Z,,False,VA,Comment on AQ94-Interim Final Rule-Comment Sub...,09000064849eee1b,2021-01-11T05:00:00Z,https://api.regulations.gov/v4/comments/VA-202...,"As a concerned citizen, Im writing today in su...","As a concerned citizen, Im writing today in su..."


In [15]:
# TF-IDF vectorizer created with scikit-learn's default settings.
vectorizer = TfidfVectorizer()

In [17]:
# Fit the vectorizer on the cleaned comments and build their TF-IDF matrix.
tfidf_matrix = vectorizer.fit_transform(df["clean_comment"])

In [18]:
# Cosine similarity of the TF-IDF matrix against itself (every comment vs. every comment).
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [None]:
# Same computation as the three step-by-step cells above, done through the
# get_cosine_sim helper; this overwrites the cosine_sim computed there.
cosine_sim = get_cosine_sim(df)

In [None]:
number = 5  # row of cosine_sim to read, i.e. which comment to compare against the others
limit = 0.95  # keep only comments whose similarity score is strictly above this value
sim_comments = get_similar_comments(
    index=number, df=df, cosine_sim=cosine_sim, threshold=limit,
)