[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nils-holmberg/socs-qmd/blob/main/jnb/lab1_nlp1.ipynb)

# download data file

In [None]:
import pandas as pd


In [None]:
#!pip install gdown
!gdown https://drive.google.com/uc?id=1EMzJxxoBaN_NbvF7xhoc09K82vQ6H_LX

In [None]:
# Read the local spreadsheet (presumably the file fetched by the gdown cell).
# header=None tells pandas the sheet has no header row, so the columns are
# numbered instead of being named after the first data row.
fp = "content.xlsx"
df = pd.read_excel(io=fp, header=None)
df.head(n=5)

In [None]:
# Alternative source: read the tab-separated copy of the data straight from
# GitHub. Like the Excel cell, this (re)binds `df` and reads without a header row.
fp = "https://raw.githubusercontent.com/nils-holmberg/socs-qmd/main/csv/content.tsv"
df = pd.read_csv(filepath_or_buffer=fp, sep="\t", header=None)
df.head(n=5)

In [None]:
# The data was read with header=None, so the columns carry numeric labels.
# Give them meaningful names; this assignment fails if `df` does not have
# exactly three columns.
df.columns = ['id', 'image', 'text']
df.head()

# analyze text data

In [None]:
import spacy

# Load spaCy's English language model.
# The model is distributed as a separate package, so it is downloaded only when
# loading fails, instead of being re-installed on every run of this cell.
try:
    nlp = spacy.load('en_core_web_sm')
except OSError:
    # spacy.load raises OSError when the model package cannot be found.
    # Downloading through spaCy's own helper avoids relying on whichever
    # `python` executable happens to be first on the shell PATH.
    from spacy.cli import download

    download('en_core_web_sm')
    # NOTE(review): in some notebook environments a kernel restart may be
    # needed before a freshly installed model can be loaded.
    nlp = spacy.load('en_core_web_sm')


In [None]:
def spacy_clean_text(doc):
    """Return the lower-cased lemmas of the content words in `doc`, space-joined.

    `doc` is any iterable of token objects exposing `lemma_`, `is_stop` and
    `is_alpha` (e.g. a spaCy Doc). A token is dropped when it is flagged as a
    stop word or is not purely alphabetic; every remaining token contributes
    its lemma in lower case. An input with no surviving tokens yields ''.
    """
    kept_lemmas = []
    for token in doc:
        # Skip stop words and anything containing digits/punctuation
        if token.is_stop or not token.is_alpha:
            continue
        kept_lemmas.append(token.lemma_.lower())
    return ' '.join(kept_lemmas)

# Work on a copy of the loaded frame: the columns added below then do not
# silently modify `df`, and re-running this cell always starts from the same input.
data = df.copy()

# Apply spaCy preprocessing to the text column.
# nlp.pipe streams the texts through the pipeline in batches, which gives the
# same Doc objects as calling nlp(text) row by row but with less overhead.
data['spacy_cleaned_text'] = [
    spacy_clean_text(doc) for doc in nlp.pipe(data['text'])
]

data.head()

In [None]:
# Build a frequency table of all cleaned tokens:
#   1. split each cleaned text on whitespace, one column per token position
#   2. stack the columns into a single long Series of tokens
#   3. count how often each token occurs (missing padding cells are not counted)
word_freq = (
    data['spacy_cleaned_text']
    .str.split(expand=True)
    .stack()
    .value_counts()
)

# Show the table, most frequent term first
word_freq

In [None]:
# Turn the frequency Series into a two-column DataFrame: the index (the words)
# becomes the 'term' column and the counts become the 'freq' column.
word_freq_df = word_freq.rename_axis('term').reset_index(name='freq')
word_freq_df.head()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Horizontal bar chart of the most frequent terms
top_n = 10  # You can change this number to display more or fewer words

# Create the figure and axes explicitly and draw onto that axes object
fig, ax = plt.subplots(figsize=(6, 4))
sns.barplot(
    data=word_freq_df.head(top_n),
    x='freq',
    y='term',
    palette='viridis',
    ax=ax,
)
# NOTE(review): recent seaborn releases may warn when `palette` is passed
# without `hue` - confirm against the installed version.

ax.set_title(f'Top {top_n} Most Frequent Words')
ax.set_xlabel('freq')
ax.set_ylabel('term')

plt.show()

In [None]:
# Define a function that runs the spaCy pipeline on a text and returns one
# record (dict) per token
def process_text(text):
    """Annotate `text` with the module-level `nlp` pipeline.

    Returns a list with one dict per token, in document order, holding:
      'token'          - the token text as it appears in the input
      'lemma'          - the token's lemma
      'part_of_speech' - the token's coarse part-of-speech tag
      'entity'         - the entity type, or the string 'None' when it is empty
    """
    records = []
    for token in nlp(text):
        records.append({
            'token': token.text,
            'lemma': token.lemma_,
            'part_of_speech': token.pos_,
            # An empty entity type is replaced by the literal string 'None'
            'entity': token.ent_type_ or 'None',
        })
    return records

# Annotate every text: each cell of the new 'spacy_nlp' column holds the list
# of per-token dicts returned by process_text for that row
data['spacy_nlp'] = data['text'].apply(func=process_text)

# Preview the new column
data['spacy_nlp'].head()

In [None]:
# Go from one row per text to one row per token: explode repeats each row once
# for every dict in its 'spacy_nlp' list.
spacy_df = data.explode('spacy_nlp')

# Expand each token dict into its own columns and put them next to the id of
# the text the token came from.
nlp_df = pd.concat(
    [
        spacy_df[['id']],
        spacy_df['spacy_nlp'].apply(pd.Series),
    ],
    axis=1,
)

# Display the new DataFrame
nlp_df

In [None]:
# Output file for the token-level table (relative to the working directory)
fp = 'nlp.tsv'

# Write the table tab-separated, leaving out the DataFrame index
nlp_df.to_csv(path_or_buf=fp, index=False, sep='\t')

# vectorize text data

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Creating a Bag of Words model: fit_transform learns the vocabulary from the
# cleaned texts and returns the document-term count matrix in one step
vectorizer = CountVectorizer()
bow_model = vectorizer.fit_transform(data['spacy_cleaned_text'])



In [None]:
# Converting BoW model to a DataFrame for better visibility:
# one row per text, one column per vocabulary term, cells hold the counts
bow_df = pd.DataFrame(bow_model.toarray(), columns=vectorizer.get_feature_names_out())

# Display the first few rows of the BoW DataFrame. Leaving the expression as
# the last line of the cell lets the notebook render it as a table.
bow_df.head()

In [None]:
# Show the learned vocabulary: a mapping from each term to its column index
# in the count matrix
vectorizer.vocabulary_

In [None]:
# Print the document-term matrix for the cleaned texts in its sparse form
print(vectorizer.transform(data['spacy_cleaned_text']))

In [None]:
# The same document-term matrix converted to a dense array
vectorizer.transform(data['spacy_cleaned_text']).toarray()

In [None]:
import os
import pandas as pd

def read_text_files(directory):
    """Load every ``.txt`` file directly inside `directory` into a DataFrame.

    Parameters
    ----------
    directory : str or path-like
        Folder to scan. Sub-folders are not searched.

    Returns
    -------
    pandas.DataFrame
        One row per text file with the columns 'File Name' and 'Content',
        ordered by file name. The frame is empty (but still has both columns)
        when the folder contains no ``.txt`` files.

    Raises
    ------
    OSError
        If `directory` cannot be listed (e.g. it does not exist) or a file
        cannot be opened.
    UnicodeDecodeError
        If a file is not valid UTF-8.
    """
    records = []

    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order of the result reproducible across runs and machines.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.txt'):
            continue

        file_path = os.path.join(directory, filename)

        # A sub-directory can also be named "*.txt"; opening it would raise.
        if not os.path.isfile(file_path):
            continue

        with open(file_path, 'r', encoding='utf-8') as file:
            records.append([filename, file.read()])

    return pd.DataFrame(records, columns=['File Name', 'Content'])

# Specify the directory containing the text files
directory_path = 'data-text'  # Replace with your directory path

# Call the function and get the DataFrame
text_df = read_text_files(directory_path)

# Preview the first rows only: each 'Content' cell holds a whole file, so
# printing the full frame would flood the output.
text_df.head()
