### Maxine's original code for the unweighted lift analysis

In [2]:
import pandas as pd

# Load the total word counts CSV file. Each row is one word: the word label is
# read into the 'Unnamed: 0' column and every other column holds one city's
# counts.
total_word_counts = pd.read_csv('total_word_counts_by_city.csv')

# List of important attributes (you can expand this list)
attributes = ['crime', 'cost', 'housing', 'jobs', 'transportation', 'weather']

# Quantities that do not depend on the attribute are computed once, up front,
# instead of being rebuilt on every pass through the loop.
word_labels = total_word_counts['Unnamed: 0']
city_counts = total_word_counts.drop(columns='Unnamed: 0')
total_words_per_city = city_counts.sum()            # all words, per city
total_words_overall = total_words_per_city.sum()    # all words, all cities

# Lift table for each attribute, keyed by attribute name
lift_data = {}

for attribute in attributes:
    attribute_rows = city_counts[word_labels == attribute]
    if attribute_rows.empty:
        print(f"The word '{attribute}' was not found in the dataset.")
        continue

    # Observed frequency of the attribute in each city. Every matching row is
    # summed (rather than taking only the first one) so the observed counts
    # always add up to the corpus-wide total used for the expectation below,
    # even if a word is listed on more than one row.
    observed_frequency_per_city = attribute_rows.sum()

    # Total count of the attribute across all cities
    total_frequency_attribute = observed_frequency_per_city.sum()

    # Expected frequency if the attribute were spread over the cities in
    # proportion to each city's share of all words
    expected_frequency_per_city = (total_frequency_attribute / total_words_overall) * total_words_per_city

    # Lift = observed / expected; values above 1 mean the attribute shows up
    # more often in that city than its overall word volume would predict.
    lift_per_city = observed_frequency_per_city / expected_frequency_per_city

    # Store the results for this attribute
    lift_data[attribute] = pd.DataFrame({
        'City': lift_per_city.index,
        'Observed Frequency': observed_frequency_per_city.values,
        'Expected Frequency': expected_frequency_per_city.values,
        'Lift': lift_per_city.values
    })

# Function to display lift data for each attribute
def display_lift_data(lift_data):
    """Print each attribute's lift table under a heading naming the attribute.

    `lift_data` maps an attribute name to the object to print for it; entries
    are printed in the mapping's iteration order.
    """
    for name in lift_data:
        heading = f"\nLift Analysis for '{name}':"
        print(heading)
        print(lift_data[name])

# Print the lift table of every attribute that was collected in `lift_data`
display_lift_data(lift_data)



Lift Analysis for 'crime':
           City  Observed Frequency  Expected Frequency      Lift
0        austin                  12            8.815534  1.361233
1        dallas                   8            5.024274  1.592270
2       houston                   6            5.467126  1.097469
3       chicago                   0            7.343986  0.000000
4      sandiego                   1            5.670510  0.176351
5       phoenix                   0            9.200210  0.000000
6  philadelphia                   7            2.399398  2.917398
7           nyc                   4            0.837029  4.778806
8    LosAngeles                   8            1.241932  6.441574

Lift Analysis for 'cost':
           City  Observed Frequency  Expected Frequency      Lift
0        austin                  41           43.311103  0.946639
1        dallas                  26           24.684476  1.053294
2       houston                  53           26.860227  1.973178
3       chicago      

### Tweaked unweighted code so output is a lift matrix

In [1]:
import pandas as pd

# Load the total word counts CSV file. Each row is one word: the word label is
# read into the 'Unnamed: 0' column and every other column holds one city's
# counts.
total_word_counts = pd.read_csv('total_word_counts_by_city.csv')

# List of important attributes
attributes = ['crime', 'cost', 'housing', 'jobs', 'transportation', 'weather']

# Quantities that do not depend on the attribute are computed once, up front,
# instead of being rebuilt on every pass through the loop.
word_labels = total_word_counts['Unnamed: 0']
city_counts = total_word_counts.drop(columns='Unnamed: 0')
total_words_per_city = city_counts.sum()            # all words, per city
total_words_overall = total_words_per_city.sum()    # all words, all cities

# One lift Series (indexed by city) per attribute; the matrix is assembled
# from these in a single step after the loop instead of growing a DataFrame
# column by column.
lift_columns = {}

for attribute in attributes:
    attribute_rows = city_counts[word_labels == attribute]
    if attribute_rows.empty:
        print(f"The word '{attribute}' was not found in the dataset.")
        continue

    # Observed frequency of the attribute in each city. Every matching row is
    # summed (rather than taking only the first one) so the observed counts
    # always add up to the corpus-wide total used for the expectation below.
    observed_frequency_per_city = attribute_rows.sum()

    # Total count of the attribute across all cities
    total_frequency_attribute = observed_frequency_per_city.sum()

    # Expected frequency if the attribute were spread over the cities in
    # proportion to each city's share of all words
    expected_frequency_per_city = (total_frequency_attribute / total_words_overall) * total_words_per_city

    # Lift = observed / expected for each city
    lift_per_city = observed_frequency_per_city / expected_frequency_per_city

    lift_columns[attribute] = lift_per_city

# At this point cities are the rows and attributes are the columns
lift_matrix = pd.DataFrame(lift_columns)

# Transpose so that attributes are the rows and cities are the columns.
# Comment this line out to keep cities as rows instead.
lift_matrix = lift_matrix.transpose()

# Display the final lift matrix
print("Lift Matrix:")
lift_matrix


Lift Matrix:


Unnamed: 0,austin,dallas,houston,chicago,sandiego,phoenix,philadelphia,nyc,LosAngeles
crime,1.361233,1.59227,1.097469,0.0,0.176351,0.0,2.917398,4.778806,6.441574
cost,0.946639,1.053294,1.973178,0.443443,1.471672,0.884936,0.424148,0.243169,0.491669
housing,1.902418,0.476852,0.262935,0.782954,2.028036,0.312493,1.198217,1.144922,1.15747
jobs,0.703559,0.514357,1.512616,1.126046,1.458363,0.561785,1.292459,1.852458,2.080846
transportation,0.16893,0.230535,0.181596,5.227201,0.233443,0.125896,0.206886,0.988422,0.133234
weather,0.945003,1.081364,1.325028,0.690479,1.213628,1.102337,0.452869,1.298179,0.583292


### Maxine's original code for the weighted lift analysis

In [6]:
import pandas as pd

# Load the weighted word counts CSV file. Each row is one word: the 'Word'
# column holds the label and every other column holds one city's weighted
# counts.
weighted_word_counts = pd.read_csv('weighted_word_counts_by_city.csv')

# List of important attributes (you can expand this list)
attributes = ['crime', 'cost', 'housing', 'jobs', 'transportation', 'weather']

# Quantities that do not depend on the attribute are computed once, up front,
# instead of being rebuilt on every pass through the loop.
word_labels = weighted_word_counts['Word']
weighted_city_counts = weighted_word_counts.drop(columns='Word')
total_weighted_words_per_city = weighted_city_counts.sum()               # per city
total_weighted_words_overall = total_weighted_words_per_city.sum()       # all cities

# Lift table for each attribute, keyed by attribute name
lift_data = {}

for attribute in attributes:
    attribute_rows = weighted_city_counts[word_labels == attribute]
    if attribute_rows.empty:
        print(f"The word '{attribute}' was not found in the dataset.")
        continue

    # Observed weighted frequency of the attribute in each city. Every
    # matching row is summed (rather than taking only the first one) so the
    # observed counts always add up to the corpus-wide total used below, even
    # if a word is listed on more than one row.
    observed_weighted_frequency_per_city = attribute_rows.sum()

    # Total weighted count of the attribute across all cities
    weighted_frequency_attribute = observed_weighted_frequency_per_city.sum()

    # Expected weighted frequency if the attribute were spread over the cities
    # in proportion to each city's share of the total weighted word count
    expected_weighted_frequency_per_city = (weighted_frequency_attribute / total_weighted_words_overall) * total_weighted_words_per_city

    # Lift = observed / expected; values above 1 mean the attribute is
    # over-represented in that city.
    lift_per_city_weighted = observed_weighted_frequency_per_city / expected_weighted_frequency_per_city

    # Store the results for this attribute
    lift_data[attribute] = pd.DataFrame({
        'City': lift_per_city_weighted.index,
        'Observed Frequency': observed_weighted_frequency_per_city.values,
        'Expected Frequency': expected_weighted_frequency_per_city.values,
        'Lift': lift_per_city_weighted.values
    })

# Function to display lift data for each attribute
def display_lift_data(lift_data):
    """Print each attribute's lift table, preceded by a heading line.

    `lift_data` maps an attribute name to the object to print for it; entries
    are printed in the mapping's iteration order.
    """
    for key in lift_data.keys():
        table = lift_data[key]
        print(f"\nLift Analysis for '{key}':")
        print(table)

# Print the weighted lift table of every attribute collected in `lift_data`
display_lift_data(lift_data)



Lift Analysis for 'crime':
           City  Observed Frequency  Expected Frequency      Lift
0    LosAngeles                 478          114.306029  4.181757
1        austin                1051          847.075841  1.240739
2       chicago                   0          661.495162  0.000000
3        dallas                 397          288.217784  1.377431
4       houston                 713          294.082591  2.424489
5           nyc                 301          125.972759  2.389405
6  philadelphia                  49          177.675551  0.275784
7       phoenix                   0          544.305049  0.000000
8      sandiego                 348          283.869234  1.225917

Lift Analysis for 'cost':
           City  Observed Frequency  Expected Frequency      Lift
0    LosAngeles                 175          375.733544  0.465756
1        austin                1453         2784.409621  0.521834
2       chicago                1160         2174.390301  0.533483
3        dallas      

### Tweaked weighted code so output is a lift matrix

In [3]:
import pandas as pd

# Load the weighted word counts CSV file. Each row is one word: the 'Word'
# column holds the label and every other column holds one city's weighted
# counts.
weighted_word_counts = pd.read_csv('weighted_word_counts_by_city.csv')

# List of important attributes
attributes = ['crime', 'cost', 'housing', 'jobs', 'transportation', 'weather']

# Quantities that do not depend on the attribute are computed once, up front,
# instead of being rebuilt on every pass through the loop.
word_labels = weighted_word_counts['Word']
weighted_city_counts = weighted_word_counts.drop(columns='Word')
total_weighted_words_per_city = weighted_city_counts.sum()               # per city
total_weighted_words_overall = total_weighted_words_per_city.sum()       # all cities

# One lift Series (indexed by city) per attribute; the matrix is assembled
# from these in a single step after the loop instead of growing a DataFrame
# column by column.
weighted_lift_columns = {}

for attribute in attributes:
    attribute_rows = weighted_city_counts[word_labels == attribute]
    if attribute_rows.empty:
        print(f"The word '{attribute}' was not found in the dataset.")
        continue

    # Observed weighted frequency of the attribute in each city. Every
    # matching row is summed (rather than taking only the first one) so the
    # observed counts always add up to the corpus-wide total used below.
    observed_weighted_frequency_per_city = attribute_rows.sum()

    # Total weighted count of the attribute across all cities
    weighted_frequency_attribute = observed_weighted_frequency_per_city.sum()

    # Expected weighted frequency if the attribute were spread over the cities
    # in proportion to each city's share of the total weighted word count
    expected_weighted_frequency_per_city = (weighted_frequency_attribute / total_weighted_words_overall) * total_weighted_words_per_city

    # Lift = observed / expected for each city
    lift_per_city_weighted = observed_weighted_frequency_per_city / expected_weighted_frequency_per_city

    weighted_lift_columns[attribute] = lift_per_city_weighted

# At this point cities are the rows and attributes are the columns
lift_matrix_weighted = pd.DataFrame(weighted_lift_columns)

# Transpose so that attributes are the rows and cities are the columns.
# Comment this line out to keep cities as rows instead.
lift_matrix_weighted = lift_matrix_weighted.transpose()

# Display the final lift matrix
print("Weighted Lift Matrix:")
lift_matrix_weighted


Weighted Lift Matrix:


Unnamed: 0,LosAngeles,austin,chicago,dallas,houston,nyc,philadelphia,phoenix,sandiego
crime,4.181757,1.240739,0.0,1.377431,2.424489,2.389405,0.275784,0.0,1.225917
cost,0.465756,0.521834,0.533483,1.736338,2.694806,0.181123,0.535927,0.732739,2.392021
housing,0.867796,1.018888,0.237108,2.194417,0.588115,1.130665,0.901852,0.328656,3.27933
jobs,2.392396,1.636928,0.329024,0.0,1.338172,0.618415,0.479141,1.547075,0.213606
transportation,0.024996,0.028373,4.084488,0.315473,0.029147,0.025349,0.884444,0.577103,0.120781
weather,0.009874,1.390042,0.267624,0.575071,0.067435,0.188144,0.156081,3.07293,0.25275


### Recommender system code (using VADER sentiment)

In [10]:
# Install necessary packages
!pip install sentence-transformers --quiet
!pip install vaderSentiment --quiet



In [None]:
# Import required libraries
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Step 1: Load and Prepare the Data
reddit_posts_file = 'reddit_posts_cities.csv'
reddit_df = pd.read_csv(reddit_posts_file)

# List the columns that contain messages
message_columns = ['Title', 'Body', 'Comment 1', 'Comment 2', 'Comment 3', 'Comment 4', 'Comment 5']

# Combine the messages into one text per row. Missing cells become empty
# strings, and every cell is cast to str before joining so that a column
# pandas happened to parse as numeric cannot make ' '.join raise a TypeError.
reddit_df['Combined_Messages'] = (
    reddit_df[message_columns]
    .fillna('')
    .astype(str)
    .apply(lambda row: ' '.join(row), axis=1)
)

# Ensure 'Subreddit' is treated as a string
reddit_df['Subreddit'] = reddit_df['Subreddit'].astype(str)

# Step 2: Define User Preferences (Attributes)
attributes_of_interest = ['safe', 'rent', 'clean']

# Step 3: Load Sentence Transformer Model
model = SentenceTransformer('all-MiniLM-L6-v2')  # Lightweight model suitable for semantic similarity

# Step 4: Create Embeddings for Attributes
# The user's preferences are represented by the average of the attribute vectors.
attribute_embeddings = model.encode(attributes_of_interest, convert_to_numpy=True)
mean_attribute_embedding = np.mean(attribute_embeddings, axis=0)

# Step 5: Encode Individual Messages and Assign to DataFrame
embeddings = model.encode(
    reddit_df['Combined_Messages'].tolist(),
    show_progress_bar=True,
    convert_to_numpy=True,
    batch_size=32
)
# One embedding vector per row, stored as an object column
reddit_df['Message_Embedding'] = list(embeddings)

# Step 6: Compute Mean Embeddings for Each City
# Each subreddit is treated as a city; its vector is the mean of its rows'
# embeddings. The lambda parameter is named differently from the global
# `embeddings` array above to avoid shadowing it.
city_embeddings = (
    reddit_df.groupby('Subreddit')['Message_Embedding']
    .apply(lambda group_vectors: np.mean(np.vstack(group_vectors), axis=0))
    .reset_index()
    .rename(columns={'Subreddit': 'City'})
)

# Step 7: Calculate Similarity Scores
# Cosine similarity between each city's mean embedding and the mean attribute embedding
city_embeddings['Similarity'] = city_embeddings['Message_Embedding'].apply(
    lambda emb: util.cos_sim(emb, mean_attribute_embedding).item()
)

# Step 8: Sentiment Analysis on Messages Mentioning Attributes Using VADER
analyzer = SentimentIntensityAnalyzer()

# Function to check if a message mentions any of the attributes
def mentions_attributes(text, attributes):
    """Return True if `text` contains any of `attributes` as a whole word.

    Matching is case-insensitive. A token is either a whitespace-separated
    chunk of the text or a run of word characters inside it, so an attribute
    directly followed or preceded by punctuation ("safe," or "(rent)") is
    still found, while a longer word that merely contains the attribute
    ("unsafe" for "safe") is not.

    Parameters:
        text: the message to search.
        attributes: iterable of attribute strings to look for.

    Returns:
        bool: True when at least one attribute occurs as a token of `text`.
    """
    import re  # local import so this cell does not depend on another cell's imports

    lowered = text.lower()
    # Whitespace tokens keep the original matching behaviour (including
    # attributes that themselves contain punctuation, e.g. "low-cost"); the
    # word-character tokens add the punctuation-stripped forms.
    tokens = set(lowered.split()) | set(re.findall(r"\w+", lowered))
    return any(attr.lower() in tokens for attr in attributes)

# Flag the rows whose combined text mentions at least one attribute of interest
reddit_df['Mentions_Attributes'] = reddit_df['Combined_Messages'].apply(
    mentions_attributes, args=(attributes_of_interest,)
)

# Work on an independent copy of just the flagged rows
mentions_mask = reddit_df['Mentions_Attributes']
attribute_mentions_df = reddit_df.loc[mentions_mask].copy()

# Score each flagged message with VADER's compound score
attribute_mentions_df['Sentiment_Score'] = attribute_mentions_df['Combined_Messages'].map(
    lambda message: analyzer.polarity_scores(message)['compound']
)

# Step 9: Compute Average Sentiment Scores for Each City
city_sentiment = (
    attribute_mentions_df.groupby('Subreddit')['Sentiment_Score']
    .mean()
    .reset_index()
    .rename(columns={'Subreddit': 'City', 'Sentiment_Score': 'Average_Sentiment_Score'})
)

# Step 10: Merge DataFrames and Calculate Overall Score
# Step 11 (ordering): rank the cities from highest to lowest overall score.
# Cities with no attribute mentions have no sentiment row; they get a score of 0.
city_texts = (
    pd.merge(city_embeddings, city_sentiment, how='left', on='City')
    .assign(Average_Sentiment_Score=lambda frame: frame['Average_Sentiment_Score'].fillna(0))
    .assign(Overall_Score=lambda frame: frame['Similarity'] * frame['Average_Sentiment_Score'])
    .sort_values(by='Overall_Score', ascending=False)
)

# Step 11: Display the Recommendations
report_columns = ['City', 'Similarity', 'Average_Sentiment_Score', 'Overall_Score']
recommendations = city_texts[report_columns]
print("\nCity Recommendations Based on Your Preferences:")
print(recommendations.reset_index(drop=True))


### Recommender system using TextBlob instead of VADER

In [None]:
# Install necessary packages
!pip install sentence-transformers --quiet
!pip install textblob --quiet
!python -m textblob.download_corpora
!pip install nltk --quiet

In [None]:

# Import required libraries
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util
from textblob import TextBlob
import nltk
# Tokenizer data for nltk.word_tokenize. Newer NLTK releases look up the
# 'punkt_tab' resource instead of 'punkt', so fetch both.
nltk.download('punkt')
nltk.download('punkt_tab')

# Step 1: Load and Prepare the Data
reddit_posts_file = 'reddit_posts_cities.csv'
reddit_df = pd.read_csv(reddit_posts_file)

# List the columns that contain messages
message_columns = ['Title', 'Body', 'Comment 1', 'Comment 2', 'Comment 3', 'Comment 4', 'Comment 5']

# Combine the messages into one text per row. Missing cells become empty
# strings, and every cell is cast to str before joining so that a column
# pandas happened to parse as numeric cannot make ' '.join raise a TypeError.
reddit_df['Combined_Messages'] = (
    reddit_df[message_columns]
    .fillna('')
    .astype(str)
    .apply(lambda row: ' '.join(row), axis=1)
)

# Ensure 'Subreddit' is treated as a string
reddit_df['Subreddit'] = reddit_df['Subreddit'].astype(str)

# Step 2: Define User Preferences (Attributes) with Synonyms
attributes_of_interest = [
    # 'clean', 'cleanliness', 'hygiene',
    # 'rent', 'rental', 'housing', 'expensive', 'affordable',
    # 'safe', 'safety', 'crime', 'dangerous', 'secure', 'unsafe'
    'restaurants', 'transportation', 'cheap'
]

# Step 3: Load Sentence Transformer Model
model = SentenceTransformer('all-MiniLM-L6-v2')  # Lightweight model suitable for semantic similarity

# Step 4: Create Embeddings for Attributes
# The user's preferences are represented by the average of the attribute vectors.
attribute_embeddings = model.encode(attributes_of_interest, convert_to_numpy=True)
mean_attribute_embedding = np.mean(attribute_embeddings, axis=0)

# Step 5: Encode Individual Messages and Assign to DataFrame
embeddings = model.encode(
    reddit_df['Combined_Messages'].tolist(),
    show_progress_bar=True,
    convert_to_numpy=True,
    batch_size=32
)
# One embedding vector per row, stored as an object column
reddit_df['Message_Embedding'] = list(embeddings)

# Step 6: Compute Mean Embeddings for Each City
# Each subreddit is treated as a city; its vector is the mean of its rows'
# embeddings. The lambda parameter is named differently from the global
# `embeddings` array above to avoid shadowing it.
city_embeddings = (
    reddit_df.groupby('Subreddit')['Message_Embedding']
    .apply(lambda group_vectors: np.mean(np.vstack(group_vectors), axis=0))
    .reset_index()
    .rename(columns={'Subreddit': 'City'})
)

# Step 7: Calculate Similarity Scores
# Cosine similarity between each city's mean embedding and the mean attribute embedding
city_embeddings['Similarity'] = city_embeddings['Message_Embedding'].apply(
    lambda emb: util.cos_sim(emb, mean_attribute_embedding).item()
)

# Step 8: Sentiment Analysis on Messages Mentioning Attributes Using TextBlob

# Function to check if a message mentions any of the attributes
def mentions_attributes(text, attributes):
    """Return True if any attribute appears among the NLTK word tokens of `text`.

    The text and each attribute are lower-cased before comparison, so the
    check is case-insensitive.
    """
    words = nltk.word_tokenize(text.lower())
    for attribute in attributes:
        if attribute.lower() in words:
            return True
    return False

# Flag the rows whose combined text mentions at least one attribute of interest
reddit_df['Mentions_Attributes'] = reddit_df['Combined_Messages'].apply(
    mentions_attributes, args=(attributes_of_interest,)
)

# Work on an independent copy of just the flagged rows
mentions_mask = reddit_df['Mentions_Attributes']
attribute_mentions_df = reddit_df.loc[mentions_mask].copy()

# Apply sentiment analysis to messages that mention attributes using TextBlob
def get_sentiment_textblob(text):
    """Return the TextBlob sentiment polarity of `text`.

    Polarity ranges from -1 (negative) to 1 (positive).
    """
    return TextBlob(text).sentiment.polarity

# Score each flagged message with TextBlob polarity
attribute_mentions_df['Sentiment_Score'] = attribute_mentions_df['Combined_Messages'].map(get_sentiment_textblob)

# Step 9: Compute Average Sentiment Scores for Each City
city_sentiment = (
    attribute_mentions_df.groupby('Subreddit')['Sentiment_Score']
    .mean()
    .reset_index()
    .rename(columns={'Subreddit': 'City', 'Sentiment_Score': 'Average_Sentiment_Score'})
)

# Step 10: Merge DataFrames and Calculate Overall Score
# Step 11 (ordering): rank the cities from highest to lowest overall score.
# Cities with no attribute mentions have no sentiment row; they get a score of 0.
city_texts = (
    pd.merge(city_embeddings, city_sentiment, how='left', on='City')
    .assign(Average_Sentiment_Score=lambda frame: frame['Average_Sentiment_Score'].fillna(0))
    .assign(Overall_Score=lambda frame: frame['Similarity'] * frame['Average_Sentiment_Score'])
    .sort_values(by='Overall_Score', ascending=False)
)

# Step 11: Display the Recommendations
report_columns = ['City', 'Similarity', 'Average_Sentiment_Score', 'Overall_Score']
recommendations = city_texts[report_columns]
print("\nCity Recommendations Based on Your Preferences:")
print(recommendations.reset_index(drop=True))


### Determining city-to-city similarity - with a hand-wavy (heuristic) sentiment adjustment

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.manifold import MDS
import nltk
nltk.download('punkt')
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.util import ngrams
from textblob import TextBlob

# ... (Assuming previous steps of data loading and processing have been done)

# Align cities, embeddings and sentiment in ONE frame so that row i of every
# matrix below refers to the same city. A left merge keeps every city from
# `city_embeddings`; an inner merge would silently drop cities that have no
# sentiment score and leave the sentiment matrix smaller than the cosine
# similarity matrix. Cities without a score get 0, the same default the
# recommender cells use.
city_embeddings_sentiment = city_embeddings.merge(city_sentiment, on='City', how='left')
city_embeddings_sentiment['Average_Sentiment_Score'] = (
    city_embeddings_sentiment['Average_Sentiment_Score'].fillna(0)
)

# Extract city names and embeddings (from the merged frame, to keep them aligned)
city_names = city_embeddings_sentiment['City'].tolist()
city_vectors = np.vstack(city_embeddings_sentiment['Message_Embedding'].values)

# Calculate the cosine similarity matrix
similarity_matrix = cosine_similarity(city_vectors)

# Extract sentiment scores
sentiment_scores = city_embeddings_sentiment['Average_Sentiment_Score'].to_numpy(dtype=float)

# Use composite similarity (combining cosine similarity and sentiment similarity)
# Calculate sentiment difference matrix: |score_i - score_j| for every city pair
sentiment_difference_matrix = np.abs(sentiment_scores[:, np.newaxis] - sentiment_scores[np.newaxis, :])

# Normalize sentiment differences to range [0, 1] and turn them into a
# similarity. When every city has the same score the largest difference is 0;
# treat all pairs as fully similar instead of dividing by zero.
max_sentiment_diff = np.max(sentiment_difference_matrix)
if max_sentiment_diff > 0:
    sentiment_similarity_matrix = 1 - (sentiment_difference_matrix / max_sentiment_diff)
else:
    sentiment_similarity_matrix = np.ones_like(sentiment_difference_matrix)

# Combine with cosine similarity matrix (element-wise product)
composite_similarity_matrix = similarity_matrix * sentiment_similarity_matrix

# Create a DataFrame for the composite similarities
composite_similarity_df = pd.DataFrame(composite_similarity_matrix, index=city_names, columns=city_names)

# Convert composite similarity matrix to dissimilarity matrix
dissimilarity_matrix = 1 - composite_similarity_matrix

# Initialize MDS (fixed random_state so the layout is reproducible)
mds = MDS(n_components=2, dissimilarity='precomputed', random_state=42)

# Fit and transform the data
mds_results = mds.fit_transform(dissimilarity_matrix)

# Create a DataFrame for MDS results
mds_df = pd.DataFrame(mds_results, columns=['Dimension 1', 'Dimension 2'])
mds_df['City'] = city_names

# Plot the MDS results
plt.figure(figsize=(10, 8))
plt.scatter(mds_df['Dimension 1'], mds_df['Dimension 2'])

# Annotate points with city names
for city, x_coord, y_coord in zip(mds_df['City'], mds_df['Dimension 1'], mds_df['Dimension 2']):
    plt.annotate(city, (x_coord, y_coord))

plt.title('MDS Plot of City Similarities (Adjusted for Sentiment)')
plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')
plt.grid(True)
plt.show()


### Determining city-to-city similarity - without any sentiment adjustment

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.manifold import MDS
import matplotlib.pyplot as plt
import nltk
nltk.download('punkt')

# ... (Assuming previous steps of data loading and processing have been done)

# These two imports repeat the ones at the top of the cell; they are kept so
# the steps below can be read (and run) on their own.
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.manifold import MDS

# Step 5: Extract City Names and Embeddings
# Stack the per-city mean embeddings into one 2-D array, one row per city.
city_names = city_embeddings['City'].tolist()
city_vectors = np.vstack(city_embeddings['Message_Embedding'].values)

# Step 6: Calculate the Cosine Similarity Matrix Between Cities
# Pairwise cosine similarity between every pair of city vectors
similarity_matrix = cosine_similarity(city_vectors)

# Label rows and columns with the city names for readability
similarity_df = pd.DataFrame(similarity_matrix, index=city_names, columns=city_names)

# Display the similarity matrix
print("Pairwise Cosine Similarities Between Cities:")
print(similarity_df)

# Step 7: Visualize Similarities Using MDS (Without Sentiment Analysis)
# MDS needs distances rather than similarities, so use 1 - cosine similarity
# (cosine similarity ranges from -1 to 1).
dissimilarity_matrix = similarity_df.rsub(1)

# Project the cities into two dimensions; the fixed random_state keeps the
# layout reproducible.
mds = MDS(n_components=2, dissimilarity='precomputed', random_state=42)
mds_results = mds.fit_transform(dissimilarity_matrix)

# Collect the 2-D coordinates together with the city labels
mds_df = pd.DataFrame(mds_results, columns=['Dimension 1', 'Dimension 2'])
mds_df['City'] = city_names

# Scatter plot of the MDS coordinates
plt.figure(figsize=(10, 8))
plt.scatter(mds_df['Dimension 1'], mds_df['Dimension 2'])

# Label every point with its city name
for city, x_coord, y_coord in zip(mds_df['City'], mds_df['Dimension 1'], mds_df['Dimension 2']):
    plt.annotate(city, (x_coord, y_coord))

plt.title('MDS Plot of City Similarities (Without Sentiment Analysis)')
plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')
plt.grid(True)
plt.show()
