In [1]:
import os
import pandas as pd
import spacy
import matplotlib.pyplot as plt
from collections import Counter
import re
import matplotlib.pyplot as plt
from collections import Counter

from google.colab import drive
# Mount Google Drive in the Colab runtime; every path below lives under this
# mount point, so this must run before any file access.
drive.mount('/content/drive')
# Load SpaCy model
nlp = spacy.load("en_core_web_sm")

# File paths
# Folder that is scanned for the input .txt documents.
input_folder = "/content/drive/My Drive/test1/About"
# Chart of the most common words, stopwords kept ("ws").
output_chart_ws = "/content/drive/My Drive/test1/common100_ws.jpg"
# Chart of the most common words, stopwords removed ("wos").
output_chart_wos = "/content/drive/My Drive/test1/common100_wos.jpg"
# Chart of the frequencies of the 'Innovativeness' dictionary words.
output_chart_innov = "/content/drive/My Drive/test1/innov_freqs.jpg"
# CSV export of the final per-document DataFrame.
output_csv = "/content/drive/My Drive/test1/innov_aussie_data.csv"

# Load text files into DataFrame
# Each .txt file in input_folder becomes one row with its base name, full path
# and raw text. The listing is sorted because os.listdir returns entries in
# arbitrary order, which would make row order (and frequency tie-breaking
# further down) vary between runs.
file_data = []
for filename in sorted(os.listdir(input_folder)):
    if filename.endswith(".txt"):
        filepath = os.path.join(input_folder, filename)
        with open(filepath, "r", encoding="utf-8") as file:
            text = file.read()
        # splitext strips only the final extension, so a name such as
        # "acme.co.txt" is kept as "acme.co" instead of being cut to "acme".
        file_data.append({"name": os.path.splitext(filename)[0], "filepath": filepath, "text": text})

# Explicit columns keep the expected schema even when no .txt file was found,
# so later df["text"] lookups do not fail with a KeyError.
df = pd.DataFrame(file_data, columns=["name", "filepath", "text"])

# Preprocess text

def preprocess_text(text, remove_stopwords=False):
    """Return the space-joined lemmas of the lower-cased *text*.

    The text is lower-cased and run through the module-level SpaCy pipeline
    ``nlp``. Only tokens whose ``is_alpha`` flag is set are kept.

    Args:
        text: Raw document text.
        remove_stopwords: When true, tokens flagged ``is_stop`` are dropped
            as well.

    Returns:
        A single string with the kept tokens' lemmas separated by spaces.
    """
    lemmas = []
    for tok in nlp(text.lower()):
        if not tok.is_alpha:
            continue
        if remove_stopwords and tok.is_stop:
            continue
        lemmas.append(tok.lemma_)
    return " ".join(lemmas)

# Lemmatised copies of every document: "ws" keeps stopwords, "wos" drops them.
df["preprocessed_ws"] = df["text"].apply(preprocess_text, remove_stopwords=False)
df["preprocessed_wos"] = df["text"].apply(preprocess_text, remove_stopwords=True)

# Count word frequencies
# Corpus-wide counts, accumulated document by document in row order.
counter_ws = Counter(
    word for doc in df["preprocessed_ws"] for word in doc.split()
)
counter_wos = Counter(
    word for doc in df["preprocessed_wos"] for word in doc.split()
)

# Function to plot most common words
def plot_common_words(counter, output_path, title, num_words=100):
    """Plot the most frequent words of *counter* as a line chart and save it.

    Args:
        counter: Word-to-count mapping providing ``most_common`` (for
            example a ``collections.Counter``).
        output_path: Path the figure is written to with ``plt.savefig``.
        title: Title shown on the chart.
        num_words: Maximum number of top-ranked words to plot.

    Returns:
        None. When there is nothing to plot (empty counter, or
        ``num_words`` selecting no entries) a message is printed and no
        file is written.
    """
    common_words = counter.most_common(num_words)
    if not common_words:
        # zip(*[]) yields nothing, so unpacking into two names below would
        # raise ValueError; skip the chart instead of aborting the script.
        print(f"No words to plot for '{title}'; skipping {output_path}.")
        return

    words, counts = zip(*common_words)
    fig = plt.figure(figsize=(15, 6))
    try:
        plt.plot(words, counts, marker="o")
        plt.xticks(rotation=90)
        plt.xlabel("Words")
        plt.ylabel("Frequency")
        plt.title(title)
        plt.tight_layout()
        plt.savefig(output_path)
    finally:
        # Always release the figure, even if plotting or saving fails.
        plt.close(fig)

# Generate and save charts
# Top-100 overview charts for the corpus, with and without stopwords.
plot_common_words(
    counter=counter_ws,
    output_path=output_chart_ws,
    title="Top 100 Words (With Stopwords)",
)
plot_common_words(
    counter=counter_wos,
    output_path=output_chart_wos,
    title="Top 100 Words (Without Stopwords)",
)

# Load 'Innovativeness' dictionary words manually
innov_words = {
    "innovation", "innovative", "creativity", "entrepreneurial",
    "disruptive", "novelty", "breakthrough", "pioneering",
}

# Filter words for 'Innovativeness' dictionary
# Keep only the dictionary words, with their counts from the stopword-
# inclusive counter, then chart them.
filtered_innov_counter = Counter(
    {word: freq for word, freq in counter_ws.items() if word in innov_words}
)
plot_common_words(
    filtered_innov_counter,
    output_chart_innov,
    "Innovativeness Word Frequencies",
    num_words=20,
)

# Compute 'innov_ws' and 'innov_perwd_ws'
def count_innov_words(text):
    """Count dictionary words in a whitespace-tokenised string.

    Args:
        text: Preprocessed text whose tokens are separated by whitespace.

    Returns:
        A ``(count, share)`` tuple: the number of tokens found in the
        module-level ``innov_words`` set, and that number divided by the
        total token count (``0`` when the text has no tokens).
    """
    tokens = text.split()
    hits = len([tok for tok in tokens if tok in innov_words])
    if not tokens:
        return hits, 0
    return hits, hits / len(tokens)

# Per-document dictionary statistics: one (count, share) pair per row.
# The pairs are split with comprehensions rather than zip(*...) unpacking,
# because unpacking raises ValueError when the DataFrame has no rows.
innov_stats = [count_innov_words(doc) for doc in df["preprocessed_ws"]]
df["innov_ws"] = [count for count, _ in innov_stats]
df["innov_perwd_ws"] = [share for _, share in innov_stats]

# Save final dataset
df.to_csv(output_csv, index=False)

print("Preprocessing and analysis complete. Outputs saved.")

Mounted at /content/drive
Preprocessing and analysis complete. Outputs saved.
