L2-regularized logistic regression for binary or multiclass classification; trains a model (on `train.txt`), optimizes L2 regularization strength on `dev.txt`, and evaluates performance on `test.txt`.  Reports test accuracy with 95% confidence intervals and prints out the strongest coefficients for each class.

In [None]:
from scipy import sparse
from sklearn import linear_model
from collections import Counter
import numpy as np
import operator
import nltk
import math
import re
import pandas as pd
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
from scipy.stats import norm

In [None]:
# Fetch the NLTK resources used below: the Punkt tokenizer models (needed by
# nltk.word_tokenize) and the averaged perceptron tagger (needed by
# nltk.pos_tag).  Both use the Python API so the cell also runs outside
# IPython, where the `!python ...` shell escape is a syntax error.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
# If you have your folder of data on your Google drive account, you can connect that here
from google.colab import drive
drive.mount('/content/drive')

# Change this to the directory with your data
# (keep the trailing slash: the split paths below are built by plain string
# concatenation, not by a path-joining helper)
directory="/content/drive/MyDrive/"

# Locations of the three data splits, all expected under `<directory>splits/`.
trainingFile = directory + "splits/train.txt"
devFile = directory + "splits/dev.txt"
testFile = directory + "splits/test.txt"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
def load_data(filename):
    """Read a tab-separated data split into parallel text and label lists.

    Every non-empty line must have at least four tab-separated columns.  The
    third column (index 2) is the label and the fourth (index 3) is the text;
    the first two columns are ignored.

    NOTE: no header handling is done here -- if the file starts with a header
    row it is returned like any other example.

    Args:
        filename: Path of the UTF-8 encoded file to read.

    Returns:
        A tuple ``(X, Y)``: ``X`` is the list of texts and ``Y`` the list of
        labels (surrounding whitespace stripped), both in file order.

    Raises:
        IndexError: If a non-empty line has fewer than four columns.
    """
    X = []
    Y = []
    with open(filename, encoding="utf-8") as file:
        for line in file:
            # Remove the line terminator before splitting; otherwise it stays
            # attached to the last column and leaks into the text features.
            line = line.rstrip("\r\n")
            if not line:
                # Tolerate empty lines (e.g. a blank line at the end of file).
                continue
            cols = line.split("\t")
            X.append(cols[3])
            Y.append(cols[2].strip())

    return X, Y


In [None]:
class Classifier:
    """Logistic-regression text classifier over dictionary-valued features.

    ``feature_method`` maps one text to a ``{feature_name: value}`` dict.  The
    constructor featurizes the three splits, builds the feature vocabulary from
    the training split and stores each split as a sparse matrix.

    Attributes set by the constructor:
        feature_vocab: feature name -> column index.
        min_feature_count: a feature is kept only if it occurs in at least
            this many training documents.
        trainX / devX / testX: sparse feature matrices of the splits.
        trainY / devY / testY: label lists of the splits.
        devX2: the raw dev texts (used for the prediction report in ``train``).
        log_reg: the selected model; ``None`` until ``train`` has run.
    """

    def __init__(self, feature_method, trainX, trainY, devX, devY, testX, testY):
        self.feature_vocab = {}
        self.feature_method = feature_method
        self.min_feature_count = 2
        self.log_reg = None

        self.trainY = trainY
        self.devY = devY
        self.testY = testY

        # The training split must be processed first: it defines the vocabulary
        # that the dev and test matrices are projected onto.
        self.trainX = self.process(trainX, training=True)
        self.devX = self.process(devX, training=False)
        self.testX = self.process(testX, training=False)

        # Raw dev texts, kept so train() can print them next to predictions.
        self.devX2 = devX

    def featurize(self, data):
        """Apply ``feature_method`` to every text and return the list of dicts."""
        return [self.feature_method(text) for text in data]

    def process(self, X_data, training=False):
        """Featurize ``X_data`` and return it as a sparse document x feature matrix.

        Args:
            X_data: Iterable of raw texts.
            training: When true, (re)build ``feature_vocab`` from ``X_data``,
                keeping features present in at least ``min_feature_count``
                documents.  When false, the existing vocabulary is used and
                unknown features are dropped.

        Returns:
            A ``scipy.sparse.dok_matrix`` of shape ``(len(X_data), len(vocab))``.
        """
        data = self.featurize(X_data)

        if training:
            feature_doc_count = Counter()
            for feats in data:
                for feat in feats:
                    feature_doc_count[feat] += 1

            # Start from an empty vocabulary: ids are handed out from 0, so
            # entries left over from an earlier fit would share column indices
            # with the new features.
            self.feature_vocab = {}
            for feat, doc_count in feature_doc_count.items():
                if doc_count >= self.min_feature_count:
                    self.feature_vocab[feat] = len(self.feature_vocab)

        F = len(self.feature_vocab)
        D = len(data)
        X = sparse.dok_matrix((D, F))
        for idx, feats in enumerate(data):
            for feat, value in feats.items():
                if feat in self.feature_vocab:
                    X[idx, self.feature_vocab[feat]] = value

        return X

    def train(self, c_values=(10,)):
        """Fit one model per ``C`` value and keep the best one on the dev split.

        Prints train/dev accuracy for every ``C``, then a report for the
        selected model: each dev text with its predicted and expected label,
        followed by predicted, expected and joint label counts.  The selected
        model is stored in ``self.log_reg``.

        Args:
            c_values: Values of scikit-learn's ``C`` (inverse regularization
                strength) to try, e.g. ``[0.1, 1, 10, 100, 200, 500, 1000]``.
                Defaults to the single value ``10``.

        Raises:
            ValueError: If ``c_values`` is empty.
        """
        c_values = list(c_values)
        if not c_values:
            raise ValueError("c_values must contain at least one value")

        # Start below any reachable accuracy so a model is always selected,
        # even when the dev accuracy is exactly 0.
        best_dev_accuracy = -1.0
        best_model = None
        best_predictions = None

        for C in c_values:
            model = linear_model.LogisticRegression(C=C, max_iter=1000)
            model.fit(self.trainX, self.trainY)
            training_accuracy = model.score(self.trainX, self.trainY)
            dev_predictions = model.predict(self.devX).tolist()
            development_accuracy = model.score(self.devX, self.devY)
            if development_accuracy > best_dev_accuracy:
                best_dev_accuracy = development_accuracy
                best_model = model
                best_predictions = dev_predictions

            print("C: %s, Train accuracy: %.3f, Dev accuracy: %.3f" % (C, training_accuracy, development_accuracy))

        self.log_reg = best_model

        # Columns: 0 = dev text, 1 = predicted label, 2 = expected label.
        # Only the selected model's predictions are reported, so the counts
        # below describe a single model even when several C values are tried.
        df = pd.DataFrame(list(zip(self.devX2, best_predictions, self.devY)))

        print("\nDev Label Predicted/Expected")
        print(df.to_string())

        gb1 = df.drop(columns=[0]).groupby(by=1).count()
        print("\nPredicted Counts")
        print(gb1.to_string())

        gb2 = df.drop(columns=[0]).groupby(by=2).count()
        print("\nExpected Counts")
        print(gb2.to_string())

        gb3 = df.groupby([1, 2]).count()
        print("\nPredicted/Expected Counts")
        print(gb3.to_string())

    def test(self):
        """Return the accuracy of the selected model on the test split."""
        return self.log_reg.score(self.testX, self.testY)

    def printWeights(self, n=10):
        """Print the ``n`` strongest features per class as ``class, weight, feature``.

        For a binary model there is a single weight vector: its ``n`` largest
        weights are printed under ``classes_[1]`` and its ``n`` smallest under
        ``classes_[0]``.  For a multiclass model the ``n`` largest weights of
        each class's own weight vector are printed.
        """
        reverse_vocab = [None] * len(self.log_reg.coef_[0])
        for feature, fid in self.feature_vocab.items():
            reverse_vocab[fid] = feature

        classes = self.log_reg.classes_
        if len(classes) == 2:
            weights = self.log_reg.coef_[0]
            self._print_ranked(classes[1], reverse_vocab, weights, n, descending=True)
            self._print_ranked(classes[0], reverse_vocab, weights, n, descending=False)
        else:
            for i, cat in enumerate(classes):
                self._print_ranked(cat, reverse_vocab, self.log_reg.coef_[i], n, descending=True)

    @staticmethod
    def _print_ranked(cat, features, weights, n, descending):
        """Print the first ``n`` (feature, weight) pairs ranked by weight, then a blank line."""
        ranked = sorted(zip(features, weights), key=operator.itemgetter(1))
        if descending:
            # Reverse the ascending sort (rather than sorting with
            # reverse=True) so that ties keep their original relative order.
            ranked.reverse()
        for feature, weight in ranked[:n]:
            print("%s\t%.3f\t%s" % (cat, weight, feature))
        print()

In [None]:
def binary_bow_featurize(text):
    """Return binary bag-of-words features for ``text``.

    The text is tokenized with ``nltk.word_tokenize``; every distinct
    lowercased token becomes a feature with value 1.
    """
    return {token.lower(): 1 for token in nltk.word_tokenize(text)}

In [None]:
def bag_of_words(text):
    """Return token counts for ``text`` (case-sensitive).

    Keys are the tokens produced by ``nltk.word_tokenize``; values are the
    number of times each token occurs.
    """
    token_counts = Counter(nltk.word_tokenize(text))
    return dict(token_counts)

In [None]:
def word_length(text):
    """Return a histogram of token lengths for ``text``.

    Keys are integer token lengths (in characters); values are the number of
    tokens of that length.
    """
    length_counts = Counter(len(token) for token in nltk.word_tokenize(text))
    return dict(length_counts)

In [None]:
def text_length(text):
    """Return a one-hot feature for the number of tokens in ``text``.

    The single key is the integer token count; its value is always 1.
    """
    token_count = len(nltk.word_tokenize(text))
    return {token_count: 1}

In [None]:
def capital(text):
    """Return the number of uppercase characters in ``text``.

    The result is a one-entry dict keyed by ``'capital_letter_count'``.
    """
    uppercase_total = 0
    for ch in text:
        if ch.isupper():
            uppercase_total += 1
    return {'capital_letter_count': uppercase_total}

In [None]:
def ngram(text, n=2):
    """Return counts of word ``n``-grams in ``text``.

    Tokens come from ``nltk.word_tokenize``; each key is ``n`` consecutive
    tokens joined by a single space, each value the number of occurrences.
    Texts with fewer than ``n`` tokens yield an empty dict.
    """
    tokens = nltk.word_tokenize(text)
    window_starts = range(len(tokens) - n + 1)
    gram_counts = Counter(' '.join(tokens[start:start + n]) for start in window_starts)
    return dict(gram_counts)

In [None]:
def pos_tags(text):
    """Return part-of-speech tag counts for ``text``.

    The text is tokenized with ``nltk.word_tokenize`` and tagged with
    ``nltk.pos_tag``; keys are the tags, values their frequencies.
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(text))
    tag_counts = Counter(tag for _token, tag in tagged_tokens)
    return dict(tag_counts)

In [None]:
def char_ngram(text, n=4):
    """Return counts of character ``n``-grams in ``text``.

    Every window of ``n`` consecutive characters (case-sensitive, whitespace
    included) is a key; the value is how often that window occurs.  Texts
    shorter than ``n`` characters yield an empty dict.
    """
    window_starts = range(len(text) - n + 1)
    gram_counts = Counter(text[start:start + n] for start in window_starts)
    return dict(gram_counts)

In [None]:
# Hand-picked keyword lists, one per listing category, consumed by `custom`.
# `custom` adds 1 to a category for every list entry found among the text's
# tokens, so a duplicated entry would be counted twice -- keep entries unique
# within a list.
luxury_keywords = [
    'elegant', 'balcony', 'luxe', 'luxurious', 'luxury', 'unique', 'private',
    'spa', 'waterfront', 'sauna', 'views', 'villa', 'mansion', 'manor',
    'penthouse', 'king', 'queen', 'resort', 'oceanfront', 'pool', 'rooftop',
    'jacuzzi', 'loft', 'property', '5-star', 'estate', 'royal',
]
family_friendly_keywords = [
    'kids', 'backyard', 'duplex', 'groups', 'family', 'family-friendly',
    'families', 'sleeps', 'playground', 'games', 'home', 'house', 'townhouse',
    'safe', 'quiet', 'yard', 'park', 'outdoor', 'pup', 'pet', 'dog',
]
nature_keywords = [
    'nature', 'natural', 'creek', 'woods', 'hike', 'oasis', 'country', 'rural',
    'scenic', 'tranquil', 'peaceful', 'hiking', 'beach', 'forest', 'cabin',
    'lodge', 'wooded', 'campsite', 'river', 'trees', 'mountain', 'ski',
    'cottage', 'garden', 'sea', 'ocean', 'retreat', 'escape', 'outdoor',
]
# NOTE(review): '1 bedroom' and '1 bath' contain a space, while `custom`
# compares against individual tokens -- presumably they never match; confirm
# and either drop them or match on bigrams.
budget_keywords = [
    'budget', 'low', 'room', 'free', 'save', 'bunk', 'deal', 'price',
    'discount', '1br', 'single', 'clean', 'sanitized', 'charming', 'cozy',
    'one', 'bedroom', 'tiny', 'inn', 'hostel', 'studio', 'friendly',
    'affordable', 'economical', 'wifi', 'rv', 'parking', '1 bedroom', '1 bath',
    'loft',
]
convenience_keywords = [
    'conveniently', 'convenient', 'convenience', 'walkable', 'walk', 'walking',
    'distance', 'miles', 'steps', 'from', 'close', 'by', 'to', 'near',
    'downtown', 'heart', 'located', 'location', 'minutes', 'short', 'drive',
    'across', 'minute', 'off',
]

In [None]:
# possible luxury words: chic, modern

In [None]:
def custom(text):
    """Count, per listing category, how many of its keywords occur in ``text``.

    The text is lowercased and tokenized with ``nltk.word_tokenize``.  For
    each category, every entry of its keyword list that appears among the
    tokens adds 1 to that category's feature (repeated occurrences of the same
    keyword in the text do not add more).  Categories without any hit are
    omitted from the result.

    Returns:
        A dict with a subset of the keys ``"luxury"``, ``"family-friendly"``,
        ``"nature"``, ``"budget"`` and ``"convenience"``.
    """
    label_list = ["luxury", "family-friendly", "nature", "budget", "convenience"]
    keywords_list = [luxury_keywords, family_friendly_keywords, nature_keywords, budget_keywords, convenience_keywords]

    # Membership is all that matters, so a set of tokens is sufficient.
    tokens = set(nltk.word_tokenize(text.lower()))

    feats = {}
    for label, keywords in zip(label_list, keywords_list):
        hits = sum(1 for keyword in keywords if keyword in tokens)
        if hits:
            feats[label] = hits
    return feats

In [None]:
def combiner_function(text):
    """Merge the outputs of several featurizers into one feature dictionary.

    Edit the ``featurizers`` tuple below to change which feature functions
    feed the combined model.

    Every key is prefixed with the name of the featurizer that produced it
    (``"<featurizer>:<feature>"``).  Without the prefix, featurizers that emit
    the same key silently overwrite each other: e.g. the token ``"luxury"``
    from ``binary_bow_featurize`` versus the ``"luxury"`` category count from
    ``custom``, or any four-letter token versus the identical character
    4-gram from ``char_ngram``.

    Args:
        text: The raw text to featurize.

    Returns:
        A dict mapping prefixed feature names to feature values.
    """
    featurizers = (binary_bow_featurize, custom, char_ngram)

    all_feats = {}
    for featurizer in featurizers:
        prefix = featurizer.__name__
        for name, value in featurizer(text).items():
            all_feats["%s:%s" % (prefix, name)] = value
    return all_feats

In [None]:
def confidence_intervals(accuracy, n, significance_level):
    """Return a normal-approximation confidence interval for an accuracy.

    Args:
        accuracy: Observed accuracy as a fraction.
        n: Number of evaluated examples.
        significance_level: Despite its name this is used as the confidence
            level of the interval (pass ``.95`` for a 95% interval).

    Returns:
        A ``(lower, upper)`` tuple: ``accuracy`` minus/plus the z-score times
        the standard error ``sqrt(accuracy * (1 - accuracy) / n)``.  The
        bounds are not clipped to ``[0, 1]``.
    """
    # Probability mass left in each tail outside the interval.
    tail_mass = (1 - significance_level) / 2
    z_score = -1 * norm.ppf(tail_mass)
    standard_error = math.sqrt((accuracy * (1 - accuracy)) / n)
    margin = standard_error * z_score
    return accuracy - margin, accuracy + margin

In [None]:

def print_confusion(classifier):
    """Plot the confusion matrix of the trained model on the test split.

    Args:
        classifier: A ``Classifier`` whose ``train`` method has been run; its
            ``log_reg`` model, ``testX`` matrix and ``testY`` labels are used.
    """
    # The fitted scikit-learn model lives in `log_reg`; the wrapper class has
    # no `predict` or `target_names` of its own.
    model = classifier.log_reg
    labels = model.classes_

    y_true = classifier.testY
    y_pred = model.predict(classifier.testX)

    # Pass `labels` explicitly so the matrix rows/columns line up with the
    # display labels even if some class is absent from the test split.
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
    disp.plot()
    plt.show()


In [None]:
def run(trainingFile, devFile, testFile):
    """Run the whole experiment on the three given split files.

    Loads the train/dev/test files, builds a ``Classifier`` that uses
    ``combiner_function`` as its featurizer, trains it, prints the test
    accuracy with its 95% confidence interval and finally prints the
    strongest feature weights.
    """
    splits = [load_data(path) for path in (trainingFile, devFile, testFile)]
    (trainX, trainY), (devX, devY), (testX, testY) = splits

    simple_classifier = Classifier(combiner_function, trainX, trainY, devX, devY, testX, testY)
    simple_classifier.train()
    accuracy = simple_classifier.test()

    ci_low, ci_high = confidence_intervals(accuracy, len(testY), .95)
    print("Test accuracy for best dev model: %.3f, 95%% CIs: [%.3f %.3f]\n" % (accuracy, ci_low, ci_high))

    simple_classifier.printWeights()
    # print_confusion(simple_classifier)
    


In [None]:
# Run the full pipeline (load the splits, train, evaluate on test, print the
# strongest weights) on the files configured above.
run(trainingFile, devFile, testFile)

# Feature ideas:
# dictionary -> ngrams? bag of words
# listing length
# capital letter count

C: 10, Train accuracy: 1.000, Dev accuracy: 0.663

Dev Label Predicted/Expected
                                                                                                         0                1                2
0                                                                                                listing\n            label            label
1                                                                                        Home Sweet Home\n  Family-friendly  Family-friendly
2                                                             ‚òÄ Sun-drenched Chic "The SOHO Cabin" ‚ùÑ\n           Nature           Luxury
3                                                             Minimalist Chic Studio With Private Garage\n           Luxury           Luxury
4                                                       Cozy Room in Historic Neighborhood close to City\n      Convenience      Convenience
5                                                                         