In [1]:
import string,re
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS
from langdetect import detect as detect_language



In [4]:
#https://insideairbnb.com/get-the-data/
#https://github.com/pemistahl/lingua-py
listings = pd.read_csv("files/input/listings.csv")
listings.columns

reviews = pd.read_csv("files/input/reviews.csv")
reviews.columns

Index(['listing_id', 'id', 'date', 'reviewer_id', 'reviewer_name', 'comments'], dtype='object')

In [5]:
#REMOVE NA Reviews
reviews = reviews[~reviews["comments"].isna()]

#REMOVE LOW REVIEW COUNTS
reviews["review_counts"] = reviews["listing_id"].map(
    reviews["listing_id"].value_counts()
)
reviews = reviews[reviews["review_counts"] > 5]

#REMOVE DOUBLE QUOTES
reviews["comments"] = reviews["comments"].apply(lambda x: x.replace('"', ''))

#REMOVE MARKUP
def remove_markup(text):
    clean = re.compile('<.*?>')
    return re.sub(clean, '', text)

reviews["comments"] = reviews["comments"].apply(lambda x: remove_markup(x))

#FIND WORD COUNT AND REMOVE LOW WORD COUNT COMMENTS
reviews["word_count"] = reviews["comments"].apply(lambda x: len(x.split(" ")))
reviews = reviews[reviews["word_count"] > 5]






In [None]:

#This will be commented out because it takes 30 mins to run. I've saved a version of it at reviews_cleaned so we can pick up from after the language filter runs
def check_language(s):
    try:
        return detect_language(s)
    except:
        return "error"

reviews["language"] = reviews["comments"].apply(lambda x: check_language(x))
reviews.to_csv("files/input/reviews_cleaned.csv")
reviews = reviews[reviews["language"] == "en"]
reviews = reviews[["listing_id", "id", "date", "reviewer_id", "comments", "word_count"]]
reviews.to_csv("files/input/reviews_cleaned.csv", index=False)


In [10]:
rc = pd.read_csv("files/input/reviews_cleaned.csv", engine="python")


In [13]:
len(rc)

774151

In [None]:
#REMOVE STOP WORDS
reviews["comments"] = reviews["comments"].apply(
    lambda x: " ".join([word.lower() for word in x.split() if word.lower() not in STOPWORDS])
)

In [24]:
reviews["review_counts"] = reviews["listing_id"].map(
    reviews["listing_id"].value_counts()
)
print(len(reviews["listing_id"].unique()))
print(len(reviews[reviews["review_counts"] > 5]["listing_id"].unique()))

25647
16232


In [None]:
reviews = pd.read_csv("files/input/reviews_cleaned.csv", engine="python")


792584


21028

In [17]:
reviews["listing_id"].isna().sum()

2139

In [16]:
print(len(reviews))
l = set(listings["id"].to_list())
r = set(reviews["listing_id"].to_list())
no_review_ids = list(l.difference(r))
len(no_review_ids)
#REMOVE NON ALPHA SUBMISSIONS FROM comments?
reviews["listing_id"]  = reviews["listing_id"].astype(int)
reviews["reviewer_id"]  = reviews["reviewer_id"].astype(int)
reviews = reviews[["listing_id", "id", "date", "reviewer_id", "comments"]]
reviews.to_csv("files/input/reviews_cleaned.csv", index=False)




792584


IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer

In [13]:
reviews.head(10)

Unnamed: 0.1,Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,word_count,language
0,2,2595.0,19760,2009-12-10,38960.0,Anita,I've stayed with my friend at the Midtown Cast...,89.0,en
1,3,2595.0,34320,2010-04-09,71130.0,Kai-Uwe,"We've been staying here for about 9 nights, en...",66.0,en
2,4,2595.0,46312,2010-05-25,117113.0,Alicia,We had a wonderful stay at Jennifer's charming...,24.0,en
3,5,2595.0,1238204,2012-05-07,1783688.0,Sergey,Hi to everyone!\rWould say our greatest compli...,97.0,en
4,6,2595.0,1293632,2012-05-17,1870771.0,Loïc,"Jennifer was very friendly and helpful, and he...",35.0,en
5,7,2595.0,2022498,2012-08-18,2124102.0,Melanie,This apartment is like a real castle old and u...,211.0,en
6,8,2595.0,4682989,2013-05-20,496053.0,Eric,Jennifer's place was in a great midtown locati...,54.0,en
7,9,2595.0,13193832,2014-05-21,13685934.0,Gerald,Jennifer is a very nice host. Everything is cl...,25.0,en
8,10,2595.0,15515108,2014-07-10,10781357.0,Richard,This is a cute studio in a wonderful location ...,50.0,en
9,11,2595.0,20372242,2014-09-28,4212558.0,Carson,"A great location, a very comfortable space and...",16.0,en


### Problem



In [24]:
#Data Source: https://www.consumerfinance.gov/data-research/consumer-complaints/search/
complaints = pd.read_csv("files/input/complaints-2025-04-15_18_25.csv")
complaints.head()

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
0,03/28/25,Student loan,Federal student loan servicing,Struggling to repay your loan,Bankruptcy,,,"Nelnet, Inc.",CA,90715,,,Web,03/28/25,Closed with explanation,Yes,,12703200
1,09/18/24,Student loan,Federal student loan servicing,Dealing with your lender or servicer,Received bad information about your loan,Navient managed my student loans for many year...,,"Navient Solutions, LLC.",MI,49505,,Consent provided,Web,11/26/24,Closed with explanation,Yes,,10159869
2,01/02/25,Student loan,Federal student loan servicing,Dealing with your lender or servicer,Problem with customer service,,Company believes it acted appropriately as aut...,MOHELA,OR,97209,,Consent not provided,Web,01/02/25,Closed with explanation,Yes,,11355387
3,12/13/24,Student loan,Federal student loan servicing,Dealing with your lender or servicer,Received bad information about your loan,Here is the problem Following an update to the...,,"Maximus Federal Services, Inc.",NY,11725,,Consent provided,Web,01/06/25,Closed with explanation,Yes,,11145010
4,06/01/12,Student loan,Non-federal student loan,Problems when you are unable to pay,,,,"Navient Solutions, LLC.",OK,73099,,,Web,06/04/12,Closed with explanation,Yes,No,90522


In [26]:
complaints["Sub-product"].value_counts()
complaints["Company"].value_counts()
complaints["Company response to consumer"].value_counts()

Company response to consumer
Closed with explanation            92502
Closed with non-monetary relief     6336
Closed with monetary relief         2324
In progress                         1651
Untimely response                   1320
Closed without relief                648
Closed                               216
Closed with relief                    75
Name: count, dtype: int64

In [9]:
complaints["Issue"].value_counts()

Issue
Dealing with your lender or servicer                                                49722
Dealing with my lender or servicer                                                  17622
Struggling to repay your loan                                                       14483
Can't repay my loan                                                                  8725
Repaying your loan                                                                   3820
Incorrect information on your report                                                 3720
Getting a loan                                                                       2180
Problems when you are unable to pay                                                  1697
Improper use of your report                                                          1228
Problem with a credit reporting company's investigation into an existing problem     1065
Problem with a company's investigation into an existing problem                       302
Cred

In [None]:
def simplify_categories(s):
    return {
        "Dealing with your lender or servicer": "lender_or_servicer",
        "Struggling to repay your loan": "repayment_problems",
        "Problem with a company's investigation into an existing problem": "lender_or_servicer",
        "Dealing with my lender or servicer": "lender_or_servicer",
        "Incorrect information on your report": "report_problems",
        "Can't repay my loan": "",
        "Improper use of your report": "report_problems",
        "Getting a loan": "",
        "Credit monitoring or identity theft protection services": "",
        "Issue with income share agreement": "",
        "Issue where my lender is my school": "",
        "Problem with fraud alerts or security freezes": "",
        "Unable to get your credit report or credit score": "report_problems",
        "Problem with a credit reporting company's investigation into an existing problem": ""
    }[s]

"Dealing with your lender or servicer": "",
"Struggling to repay your loan": "",
"Problem with a company's investigation into an existing problem": "",
"Dealing with my lender or servicer": "",
"Incorrect information on your report": "",
"Can't repay my loan": "",
"Improper use of your report": "",
"Getting a loan": "",
"Credit monitoring or identity theft protection services": "",
"Issue with income share agreement": "",
"Issue where my lender is my school": "",
"Problem with fraud alerts or security freezes": "",
"Unable to get your credit report or credit score": "",
"Problem with a credit reporting company's investigation into an existing problem": "",


In [19]:
print(sum(~complaints["Consumer complaint narrative"].isna()))
complaints = complaints[~complaints["Consumer complaint narrative"].isna()]
complaints["Consumer complaint narrative"].head()


49954


1     Navient managed my student loans for many year...
3     Here is the problem Following an update to the...
7     I have ask Navient for information about my lo...
9     Mohela autopayment issues.\n\nAutopay will not...
10    I consolidated my federal student loans in XX/...
Name: Consumer complaint narrative, dtype: object

In [20]:
complaints["Customer complaint length"] = complaints["Consumer complaint narrative"].apply(lambda x: len(x.split(" ")))
complaints["Customer complaint length"].median()

170.0