# Homework 1: Text Mining
## Part 2: Text analysis

Group Members: Matias Borrel, Pol Garcia, Marvin Ernst

#### Importing relevant Libraries:

In [21]:
import os
import pandas as pd
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

[nltk_data] Downloading package stopwords to /Users/Admin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/Admin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /Users/Admin/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


#### Define file paths:

In [22]:
files = {
    "Barcelona": {
        "Post": "./Original_data/hotels_barcelona_MWC.csv",
        "Pre": "./Original_data/hotels_barcelona_after_MWC.csv",
    },
    "Madrid": {
        "Post": "./Original_data/hotel_data_Madrid_MWC.csv",
        "Pre": "./Original_data/hotels_madrid_after_MWC.csv",
    },
}

#### Load Data:

In [23]:
dataframes = {city: {key: pd.read_csv(path) for key, path in city_files.items()} for city, city_files in files.items()}

#### Inspect the data frames:

Print column names for each dataframe:

In [24]:
for city, periods in dataframes.items():
    for period, df in periods.items():
        print(f"{city} - {period}:")
        print(df.columns)
        print("\n")

Barcelona - Post:
Index(['Name', 'Price', 'Rating', 'Detail Link', 'Description'], dtype='object')


Barcelona - Pre:
Index(['Name', 'Price', 'Rating', 'Detail Link', 'Description'], dtype='object')


Madrid - Post:
Index(['Name', 'Price', 'Rating', 'Detail Link', 'Description'], dtype='object')


Madrid - Pre:
Index(['Name', 'Price', 'Rating', 'Detail Link', 'Description'], dtype='object')




#### Preparing the Dataframes

We want that pre and post treatment we have the same units (here "hotels") in each of the groups (here "city"). 

Filter Out Rows with Empty Names, or missing values for any of the columns:

In [25]:
for city, periods in dataframes.items():
    for period, df in periods.items():
        # Drop rows with missing values in any column
        dataframes[city][period] = df.dropna(how='any')

Retain Only Names Present in Both Pre and Post:

In [26]:
for city, periods in dataframes.items():
    # Get the intersection of names in Pre and Post datasets
    common_names = set(periods['Pre']['Name']).intersection(set(periods['Post']['Name']))
    
    # Filter rows in both Pre and Post to only include these common names
    dataframes[city]['Pre'] = periods['Pre'][periods['Pre']['Name'].isin(common_names)]
    dataframes[city]['Post'] = periods['Post'][periods['Post']['Name'].isin(common_names)]

Verify Results:

In [27]:
for city, periods in dataframes.items():
    print(f"{city} - Pre dataset shape: {periods['Pre'].shape}")
    print(f"{city} - Post dataset shape: {periods['Post'].shape}")
    print(f"{city} - Common Names: {len(set(periods['Pre']['Name']).intersection(set(periods['Post']['Name'])))}")

Barcelona - Pre dataset shape: (409, 5)
Barcelona - Post dataset shape: (409, 5)
Barcelona - Common Names: 409
Madrid - Pre dataset shape: (536, 5)
Madrid - Post dataset shape: (536, 5)
Madrid - Common Names: 536


Check for duplicates:

In [28]:
for city, periods in dataframes.items():
    for period, df in periods.items():
        # Check for duplicates in the 'Name' column
        duplicate_count = df['Name'].duplicated().sum()  # Count duplicates
        total_names = df['Name'].shape[0]  # Total number of rows
        print(f"{city} - {period}:")
        print(f"Total Names: {total_names}, Duplicates: {duplicate_count}")
        
        if duplicate_count > 0:
            # Display duplicate rows
            duplicates = df[df['Name'].duplicated(keep=False)]
            print("Duplicate Names:")
            print(duplicates[['Name']])
        print("\n")

Barcelona - Post:
Total Names: 409, Duplicates: 0


Barcelona - Pre:
Total Names: 409, Duplicates: 0


Madrid - Post:
Total Names: 536, Duplicates: 0


Madrid - Pre:
Total Names: 536, Duplicates: 0




Since we do not have any dublicates, we don't need to further filter some observations, i.e. the names uniquely identify each hotel.

#### Convert the Price to numeric:

In [29]:
for city, periods in dataframes.items():
    for period, df in periods.items():
        # Remove all non-numeric characters (e.g., € and spaces) and dots (thousands separators)
        df['Price'] = (
            df['Price']
            .astype(str)  # Ensure the column is treated as strings
            .str.replace(r'[^\d]', '', regex=True)  # Remove non-numeric characters
            .astype(int)  # Convert cleaned strings to integers
        )

Verify the changes:

In [30]:
for city, periods in dataframes.items():
    for period, df in periods.items():
        print(f"--- {city} - {period} ---")
        print(df[['Name', 'Price']].head())  # Show the 'Name' and cleaned 'Price' columns
        print("\n")

--- Barcelona - Post ---
                                     Name  Price
0                         Tembo Barcelona   2821
1          Hesperia Barcelona Barri Gòtic   2311
2                           Royal Ramblas   3519
3  Axel TWO Barcelona 4 Sup - Adults Only   2119
4                           TSA La Rambla   2235


--- Barcelona - Pre ---
                             Name  Price
0         chic&basic Habana Hoose   1075
4                    Hotel Market    807
5   Travelodge Barcelona Poblenou    651
8                       Exe Mitre    735
10                  Hotel Condado    696


--- Madrid - Post ---
                             Name  Price
0              BYPILLOW Crosstown    891
1  Only YOU Boutique Hotel Madrid   1818
2             Hostal Flat55Madrid    571
3                     Hostal Adis    869
4   ITC Infantas by Soho Boutique   1173


--- Madrid - Pre ---
                                   Name  Price
0               Hostal New Dream Madrid    576
1                     

##### Deleting the Link column:

In [34]:
for city, periods in dataframes.items():
    for period, df in periods.items():
        # Drop the 'Detail Link' column if it exists
        if 'Detail Link' in df.columns:
            df.drop(columns=['Detail Link'], inplace=True)

#### Convert the Ranking to numeric:

In [35]:
for city, periods in dataframes.items():
    for period, df in periods.items():
        if 'Rating' in df.columns:
            # Extract numeric values, replace commas with dots, and convert to float
            df['Rating'] = (
                df['Rating']
                .astype(str)  # Ensure all values are strings
                .str.extract(r'(\d+,\d+)')  # Extract only the first valid number
                .replace(',', '.', regex=True)  # Replace comma with dot
                .astype(float)  # Convert to float
            )

Verify the changes:

In [37]:
for city, periods in dataframes.items():
    for period, df in periods.items():
        print(f"--- {city} - {period} ---")
        print(df[['Name', 'Rating']].head())  # Show the 'Name' and cleaned 'Price' columns
        print("\n")

--- Barcelona - Post ---
                                     Name  Rating
0                         Tembo Barcelona    8.88
1          Hesperia Barcelona Barri Gòtic    8.78
2                           Royal Ramblas    8.38
3  Axel TWO Barcelona 4 Sup - Adults Only    8.38
4                           TSA La Rambla    9.19


--- Barcelona - Pre ---
                             Name  Rating
0         chic&basic Habana Hoose    8.58
4                    Hotel Market    7.67
5   Travelodge Barcelona Poblenou    7.37
8                       Exe Mitre    8.38
10                  Hotel Condado    8.58


--- Madrid - Post ---
                             Name  Rating
0              BYPILLOW Crosstown    8.78
1  Only YOU Boutique Hotel Madrid    9.39
2             Hostal Flat55Madrid    8.08
3                     Hostal Adis    8.48
4   ITC Infantas by Soho Boutique    8.28


--- Madrid - Pre ---
                                   Name  Rating
0               Hostal New Dream Madrid    7.97
1 

#### Summarize and Compare:

In [38]:
for city, periods in dataframes.items():
    for period, df in periods.items():
        avg_price = round(df['Price'].mean(), 2)  # Round average price to 2 decimal places
        avg_score = round(df['Rating'].mean(), 2)  # Round average score to 2 decimal places
        print(f"{city} - {period}: Avg Price = {avg_price}, Avg Score = {avg_score}")

Barcelona - Post: Avg Price = 2605.51, Avg Score = 8.28
Barcelona - Pre: Avg Price = 1189.0, Avg Score = 8.28
Madrid - Post: Avg Price = 1140.85, Avg Score = 7.82
Madrid - Pre: Avg Price = 1169.32, Avg Score = 7.83


As expected, the rankings are the same, and we see changes in the price.

## (a) Preprocess the Text 

##### Looking at the first few descriptions:

In [None]:
for city, periods in dataframes.items():
    for period, df in periods.items():
        print(f"--- {city} - {period} ---")
        print(df[['Name', 'Description']].head())  
        print("\n")

--- Barcelona - Post ---
                                     Name  \
0                         Tembo Barcelona   
1          Hesperia Barcelona Barri Gòtic   
2                           Royal Ramblas   
3  Axel TWO Barcelona 4 Sup - Adults Only   
4                           TSA La Rambla   

                                         Description  
0  Tembo Barcelona está en Barcelona, a 16 min a ...  
1  Hesperia Barcelona Barri Gòtic es un alojamien...  
2  El Royal Ramblas está ubicado en las famosas R...  
3  El TWO Hotel Barcelona by Axel está situado en...  
4  TSA La Rambla, que cuenta con terraza y wifi g...  


--- Barcelona - Pre ---
                             Name  \
0         chic&basic Habana Hoose   
4                    Hotel Market   
5   Travelodge Barcelona Poblenou   
8                       Exe Mitre   
10                  Hotel Condado   

                                          Description  
0   Chic&basic Habana Hoose es un alojamiento con ...  
4   El Hotel 

##### Check whether descriptions match for both observation times:

In [41]:
pd.set_option('display.max_colwidth', None) # to see the full descriptions

for city, periods in dataframes.items():
    # Merge the Pre and Post dataframes on 'Name'
    merged_df = periods['Pre'].merge(periods['Post'], on='Name', suffixes=('_Pre', '_Post'))
    
    mismatched = merged_df[merged_df['Description_Pre'] != merged_df['Description_Post']]
    
    print(f"--- {city}: Description Differences ---")
    if mismatched.empty:
        print("All descriptions match!")
    else:
        print(mismatched[['Name', 'Description_Pre', 'Description_Post']])
    print("\n")

--- Barcelona: Description Differences ---
                Name  \
227  Sagrada Familia   
238      Casa Lolita   

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            Description_Pre  \
227                                                                                                         Sagrada Familia se encuentra en Barcelona, a 11 min a pie de Sagrada Fami

#### Normalize and Remove Special Characters:

In [None]:
for city, periods in dataframes.items():
    for period, df in periods.items():
        df['Description'] = df['Description'].str.replace(r'[√©√±]', '', regex=True)

#### Tokenize, Remove Stop Words, and Stem:

In [None]:
stop_words = set(stopwords.words('spanish'))  # Spanish stop words
stemmer = PorterStemmer()

def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    filtered_tokens = [stemmer.stem(word) for word in tokens if word.isalnum() and word not in stop_words]
    return ' '.join(filtered_tokens)

for city, periods in dataframes.items():
    for period, df in periods.items():
        df['Cleaned_Description'] = df['Description'].apply(preprocess_text)

## (b) Create Word Clouds:

In [None]:
def generate_wordcloud(text, title):
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(' '.join(text))
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.title(title)
    plt.axis('off')
    plt.show()

for city, periods in dataframes.items():
    for period, df in periods.items():
        # Raw text word cloud
        generate_wordcloud(df['Description'], f"{city} - {period} (Raw)")

        # Preprocessed text word cloud
        generate_wordcloud(df['Cleaned_Description'], f"{city} - {period} (Cleaned)")

####  Save Cleaned Data:

In [None]:
for city, periods in dataframes.items():
    for period, df in periods.items():
        df.to_csv(f"{city}_{period}_cleaned.csv", index=False)