In [None]:
# %pip install spacy

In [1]:
import pandas as pd
import numpy as np

import spacy
from spacy import displacy

In [2]:
file = "data/12/NewYorkCity/listings_big_clean.csv" 

df = pd.read_csv(file, low_memory=False)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36322 entries, 0 to 36321
Data columns (total 50 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Unnamed: 0                    36322 non-null  int64  
 1   id                            36322 non-null  int64  
 2   name                          36322 non-null  object 
 3   host_id                       36322 non-null  int64  
 4   host_since                    36322 non-null  object 
 5   host_verifications            36322 non-null  object 
 6   host_identity_verified        36322 non-null  bool   
 7   neighbourhood_cleansed        36322 non-null  object 
 8   neighbourhood_group_cleansed  36322 non-null  object 
 9   latitude                      36322 non-null  float64
 10  longitude                     36322 non-null  float64
 11  property_type                 36322 non-null  object 
 12  room_type                     36322 non-null  object 
 13  a

In [4]:
# Unnamed: 0 als reine Index-Spalte aus EDA ausschließen
df = df.drop(columns="Unnamed: 0", axis=1, errors="ignore")

for c in ["host_since", "first_review", "last_review"]:
    df[c] = pd.to_datetime(df[c], errors="coerce")

Das Standardmodell ist en_core_web_sm (Englisch). 
Das Modell wird mit dem folgenden Befehl in der console heruntergeladen

In [None]:
# !python -m spacy download en_core_web_sm

In [5]:
nlp = spacy.load("en_core_web_sm")

In [20]:
df["amenities"].values[5].replace("[", "").replace("]", "")

'"Stove", "TV", "Hot water", "Kitchen", "Hangers", "Refrigerator", "Fire extinguisher", "Bed linens", "Smoke alarm", "Clothing storage", "Hair dryer", "Wifi", "Iron", "Carbon monoxide alarm", "Heating", "Air conditioning", "Coffee maker", "Paid parking off premises", "Extra pillows and blankets", "Room-darkening shades", "Free street parking", "Host greets you"'

In [38]:
len(df)//4

9080

In [42]:
text = "".join([i.replace("[", "").replace("]", "") for i in df["amenities"].values[:10000]])
len(text) # 3328489
# Error: Text of length 3328489 exceeds maximum of 1000000

3328489

In [57]:
doc = nlp(text[:800000])

In [58]:
print([token.text for token in doc][:100])

['"', 'Dedicated', 'workspace', '"', ',', '"', 'Essentials', '"', ',', '"', 'Cooking', 'basics', '"', ',', '"', 'Stove', '"', ',', '"', 'Blender', '"', ',', '"', 'Dishes', 'and', 'silverware', '"', ',', '"', 'Hot', 'water', '"', ',', '"', 'Dining', 'table', '"', ',', '"', 'Dryer', '"', ',', '"', 'Washer', '"', ',', '"', 'Lock', 'on', 'bedroom', 'door', '"', ',', '"', 'Kitchen', '"', ',', '"', 'Bathtub', '"', ',', '"', 'Oven', '"', ',', '"', 'Refrigerator', '"', ',', '"', 'Wine', 'glasses', '"', ',', '"', 'Single', 'level', 'home', '"', ',', '"', 'Fire', 'extinguisher', '"', ',', '"', 'Fast', 'wifi', '\\u2013', '330', 'Mbps', '"', ',', '"', 'Luggage', 'dropoff', 'allowed', '"', ',', '"']


In [59]:
from collections import Counter

words = [token.lemma_.lower() for token in doc if not token.is_stop and not token.is_punct and not token.is_space]
len(words)

87591

In [60]:
# Häufigkeit für jedes Wort berechnen
word_freq = Counter(words)
# 10 häufigste Wörter auswählen
most_freq_words = word_freq.most_common(5)

for word in most_freq_words:
    print(word)

('alarm', 2955)
('wifi', 1961)
('dryer', 1843)
('kitchen', 1719)
('hot', 1638)


In [61]:
import pandas as pd
tokens = pd.DataFrame({"Token": [token.text for token in doc],
              "Lemma": [token.lemma_ for token in doc],
              "POS": [token.pos_ for token in doc],
              "Tag": [token.tag_ for token in doc],
              "Dep": [token.dep_ for token in doc]})
tokens[tokens.POS == "NOUN"]

Unnamed: 0,Token,Lemma,POS,Tag,Dep
2,workspace,workspace,NOUN,NN,nmod
6,Essentials,essential,NOUN,NNS,nmod
11,basics,basic,NOUN,NNS,appos
23,Dishes,dish,NOUN,NNS,conj
25,silverware,silverware,NOUN,NN,conj
...,...,...,...,...,...
232724,maker,maker,NOUN,NN,appos
232729,parking,parking,NOUN,NN,compound
232731,premises,premise,NOUN,NNS,appos
232736,products,product,NOUN,NNS,appos


In [62]:
# Häufigkeit für jedes Noun berechnen
nouns = tokens[tokens.POS == "NOUN"].Token
word_freq = Counter(nouns)
# 10 häufigste Wörter auswählen
most_freq_nouns = word_freq.most_common(10)

for word in most_freq_nouns:
    print(word)

('alarm', 2955)
('Wifi', 1836)
('Smoke', 1597)
('water', 1562)
('parking', 1383)
('Heating', 1378)
('dryer', 1367)
('monoxide', 1359)
('TV', 1289)
('Air', 1256)
