In [19]:
# ---- Imports ----
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
from rake_nltk import Rake

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /Users/michaelscutari/mic
[nltk_data]     romamba/envs/cs671/lib/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/michaelscutari/micro
[nltk_data]     mamba/envs/cs671/lib/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /Users/michaelscutari/mic
[nltk_data]     romamba/envs/cs671/lib/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [24]:
# ---- Import Data ----
data = pd.read_csv('../data/train.csv')
descriptions = data['description']

print(descriptions.head())

0    Welcome to Bed-Stuy, Brooklyn! Our newly renov...
1    Lovely nonsmoking annex in Brooklyn's "secret ...
2    This studio presents unparalleled convenience ...
3    - Furnished room in a newly renovated apartmen...
4    This modern property in Manhattan is just step...
Name: description, dtype: object


In [36]:
def preprocess_and_extract_nmf_features(df, 
                                        description_col='description', 
                                        n_components=40, 
                                        max_features=1000, 
                                        random_state=42,
                                        top_n_words=8):
    """
    Preprocesses the 'description' column of a pandas DataFrame, extracts NMF features,
    and identifies top contributing words for each NMF component.

    Parameters:
    - df (pd.DataFrame): The input DataFrame containing the 'description' column.
    - description_col (str): The name of the column containing property descriptions.
    - n_components (int): Number of NMF components to extract.
    - max_features (int): Maximum number of features for the TF-IDF vectorizer.
    - random_state (int): Random state for reproducibility.
    - top_n_words (int): Number of top words to display for each NMF component.

    Returns:
    - pd.DataFrame: A DataFrame containing the NMF features.
    - TfidfVectorizer: The fitted TF-IDF vectorizer.
    - NMF: The fitted NMF model.
    - dict: A dictionary with top words for each NMF component.
    """
    
    import pandas as pd
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.decomposition import NMF
    import nltk
    from nltk.corpus import stopwords
    from nltk.stem import WordNetLemmatizer
    import re
    
    # Initialize stopwords and lemmatizer
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    
    def clean_text(text):
        # Lowercase
        text = text.lower()
        # Remove punctuation and non-alphabetic characters
        text = re.sub(r'[^a-z\s]', '', text)
        # Tokenize
        tokens = text.split()
        # Remove stopwords and lemmatize
        tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
        # Join back into string
        return ' '.join(tokens)
    
    # Apply text cleaning
    df['clean_description'] = df[description_col].astype(str).apply(clean_text)
    
    # Initialize TF-IDF Vectorizer
    tfidf_vectorizer = TfidfVectorizer(max_features=max_features,
                                       ngram_range=(1, 2))  
    
    # Fit and transform the clean descriptions
    tfidf_matrix = tfidf_vectorizer.fit_transform(df['clean_description'])
    
    # Initialize and fit NMF
    nmf_model = NMF(n_components=n_components, random_state=random_state)
    nmf_features = nmf_model.fit_transform(tfidf_matrix)
    
    # Create a DataFrame for NMF features
    nmf_feature_names = [f'nmf_feature_{i+1}' for i in range(n_components)]
    nmf_df = pd.DataFrame(nmf_features, columns=nmf_feature_names, index=df.index)
    
    # Extract top words for each NMF component
    feature_names = tfidf_vectorizer.get_feature_names_out()
    top_words_dict = {}
    for i, topic in enumerate(nmf_model.components_):
        top_indices = topic.argsort()[::-1][:top_n_words]
        top_words = [feature_names[j] for j in top_indices]
        top_words_dict[f'nmf_feature_{i+1}'] = top_words
    
    return nmf_df, tfidf_vectorizer, nmf_model, top_words_dict

# Extract NMF features
nmf_df, tfidf_vectorizer, nmf_model, top_words_dict = preprocess_and_extract_nmf_features(data)

# Display NMF features
print(nmf_df.head())

# Display top words for each NMF component
for feature, words in top_words_dict.items():
    print(f'{feature}: {words}')

   nmf_feature_1  nmf_feature_2  ...  nmf_feature_39  nmf_feature_40
0       0.004458            0.0  ...        0.000000        0.001122
1       0.000465            0.0  ...        0.004788        0.032576
2       0.000088            0.0  ...        0.000000        0.015328
3       0.003704            0.0  ...        0.000000        0.000000
4       0.000000            0.0  ...        0.060400        0.000000

[5 rows x 40 columns]
nmf_feature_1: ['minute', 'minute walk', 'airport', 'walk', 'train', 'jfk', 'minute away', 'manhattan']
nmf_feature_2: ['nan', 'zoo', 'facing', 'filled', 'ferry', 'feel like', 'feel home', 'feel']
nmf_feature_3: ['building', 'elevator', 'view', 'elevator building', 'laundry', 'floor', 'located', 'central']
nmf_feature_4: ['br', 'br br', 'br apartment', 'please', 'br min', 'br located', 'hotel', 'br please']
nmf_feature_5: ['comfort', 'perfect', 'offer', 'heart', 'city', 'modern', 'vibrant', 'welcome']
nmf_feature_6: ['private', 'bathroom', 'private bathroom

In [35]:
import pandas as pd

# Assume nmf_features is a NumPy array of shape (n_samples, n_topics)
nmf_features_df = nmf_df.copy()
nmf_features_df['price'] = data['price'].values

# Calculate correlation matrix
correlation_matrix = nmf_features_df.corr()

# Extract correlations with price
topic_price_correlations = correlation_matrix['price'][:-1]  # Exclude the 'price' correlation with itself

# Display correlations
print(topic_price_correlations.sort_values(ascending=False))

nmf_feature_39    0.202113
nmf_feature_8     0.165156
nmf_feature_12    0.165137
nmf_feature_11    0.163946
nmf_feature_5     0.151474
nmf_feature_14    0.133378
nmf_feature_3     0.095600
nmf_feature_36    0.072483
nmf_feature_4     0.034960
nmf_feature_22    0.034727
nmf_feature_16    0.029955
nmf_feature_33    0.011914
nmf_feature_37    0.005079
nmf_feature_15    0.004216
nmf_feature_28    0.001574
nmf_feature_34    0.000086
nmf_feature_10   -0.002596
nmf_feature_21   -0.009811
nmf_feature_38   -0.011828
nmf_feature_2    -0.012553
nmf_feature_40   -0.015353
nmf_feature_13   -0.015429
nmf_feature_25   -0.015988
nmf_feature_26   -0.019187
nmf_feature_35   -0.021108
nmf_feature_23   -0.022635
nmf_feature_7    -0.027408
nmf_feature_18   -0.039716
nmf_feature_32   -0.039933
nmf_feature_24   -0.043949
nmf_feature_30   -0.048784
nmf_feature_20   -0.099736
nmf_feature_27   -0.100438
nmf_feature_29   -0.111710
nmf_feature_6    -0.119532
nmf_feature_1    -0.126186
nmf_feature_9    -0.126557
n

In [38]:
def preprocess_and_extract_rake_features(df, 
                                         description_col='description', 
                                         max_keywords=10, 
                                         min_keyword_length=1,
                                         max_features=1000):
    """
    Preprocesses the 'description' column of a pandas DataFrame, extracts RAKE keywords,
    and transforms them into a feature matrix.
    
    Parameters:
    - df (pd.DataFrame): Input DataFrame containing the 'description' column.
    - description_col (str): Name of the column with property descriptions.
    - max_keywords (int): Maximum number of keywords to extract per description.
    - min_keyword_length (int): Minimum number of words in a keyword.
    - max_features (int): Maximum number of unique keywords to consider as features.
    
    Returns:
    - pd.DataFrame: DataFrame containing RAKE features.
    - Rake: The fitted RAKE object.
    """
    
    # Initialize RAKE with English stopwords and specify minimum keyword length
    rake = Rake(stopwords=stopwords.words('english'), 
                min_length=min_keyword_length, 
                max_length=5)  # You can adjust max_length based on your data
    
    # Initialize lemmatizer
    lemmatizer = WordNetLemmatizer()
    
    def clean_text(text):
        text = text.lower()
        text = re.sub(r'[^a-z\s]', '', text)
        # remove words in more than 90% of the documents
        text = re.sub(r'\b\w{1,2}\b', '', text)
        return text
    
    def lemmatize_keywords(keywords):
        return [' '.join([lemmatizer.lemmatize(word) for word in keyword.split()]) for keyword in keywords]
    
    # Clean descriptions
    df['clean_description'] = df[description_col].astype(str).apply(clean_text)
    
    # Extract keywords using RAKE
    def extract_keywords(text):
        rake.extract_keywords_from_text(text)
        keywords_with_scores = rake.get_ranked_phrases_with_scores()
        # Sort by score descending and take top N keywords
        sorted_keywords = sorted(keywords_with_scores, key=lambda x: x[0], reverse=True)
        top_keywords = [kw for score, kw in sorted_keywords[:max_keywords]]
        # Lemmatize keywords
        lemmatized = lemmatize_keywords(top_keywords)
        return lemmatized
    
    df['keywords'] = df['clean_description'].apply(extract_keywords)
    
    # Flatten the list of keywords to find the most common ones
    all_keywords = [keyword for sublist in df['keywords'] for keyword in sublist]
    # Optionally, you can apply frequency filtering here
    # For simplicity, we'll select the top 'max_features' most common keywords
    from collections import Counter
    keyword_counts = Counter(all_keywords)
    most_common_keywords = [kw for kw, count in keyword_counts.most_common(max_features)]
    
    # Initialize a keyword extractor based on the most common keywords
    # Create binary features indicating the presence of keywords
    def keyword_features(keywords):
        features = {}
        for kw in most_common_keywords:
            features[kw] = 1 if kw in keywords else 0
        return features
    
    # Apply to the DataFrame
    rake_features_df = pd.DataFrame(df['keywords'].apply(keyword_features).tolist()).fillna(0)
    
    # Optionally, you can keep the original DataFrame unchanged and return only the features
    return rake_features_df, rake

# Apply RAKE feature extraction
rake_features_df, rake = preprocess_and_extract_rake_features(
    data, 
    description_col='description', 
    max_keywords=10,        # Extract top 5 keywords per description
    min_keyword_length=1,  # Minimum 1 word per keyword
    max_features=20        # Consider top 20 most common keywords as features
)

# Display RAKE features
print(rake_features_df.columns)

Index(['stay', 'welcome', 'train', 'room', 'apartment', 'start living', 'nan',
       'relax', 'home', 'city', 'spacious', 'step', 'need', 'minute', 'enjoy',
       'subway', 'heart', 'onetime room cleaning', 'minute walk', 'step away'],
      dtype='object')
