In [None]:
# Install required libraries
!pip install openai pandas transformers python-docx tqdm torch



In [None]:
import getpass
import os

import openai
import pandas as pd
from docx import Document
from google.colab import drive
from tqdm import tqdm
from transformers import pipeline

In [None]:
# Mount Google Drive at /content/drive so the project files under MyDrive
# (interview documents in, results CSV out) are reachable from this runtime.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Create the OpenAI client without ever storing the key in the notebook.
# The key is taken from the OPENAI_API_KEY environment variable; when that is
# unset or empty it is requested interactively (the input is not echoed and
# does not end up in the saved cell source or output).
client = openai.OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY") or getpass.getpass("OpenAI API key: ")
)

In [None]:
# Load the Hugging Face sentiment classifier that analyze_sentiment() calls.
# NOTE(review): this checkpoint reports opaque labels (LABEL_0 / LABEL_1 / LABEL_2);
# per its model card these are presumably negative / neutral / positive — confirm
# before interpreting the "Sentiment" column of the results.
sentiment_analyzer = pipeline("sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment")

Device set to use cpu


In [None]:
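# Disabled earlier experiment: local Hugging Face models (nothing in this cell runs; summarization is done by summarize_with_gpt4())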
#summarizer = pipeline("summarization", model="google/bigbird-pegasus-large-arxiv")  # BigBird for long texts
#sentiment_analyzer = pipeline("sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment")
#tokenizer = AutoTokenizer.from_pretrained("google/bigbird-pegasus-large-arxiv")  # Tokenizer for counting tokens

In [None]:
# Drive folder that holds the interview transcripts; analyze_interviews() reads
# every .docx file found directly inside it.
interviews_folder = "/content/drive/MyDrive/alaska_vaccine_project/interviews"

In [None]:
# Function to extract text from a Word document
def extract_text_from_docx(file_path):
    """Read a Word document and return its paragraph text as one string.

    The text of every paragraph is joined with newlines, then leading and
    trailing whitespace is stripped from the combined result.

    Args:
        file_path: Location of the .docx file; handed directly to ``Document``.

    Returns:
        The combined text, or ``None`` when nothing is left after stripping.
    """
    paragraphs = Document(file_path).paragraphs
    combined = "\n".join(paragraph.text for paragraph in paragraphs).strip()
    if not combined:
        return None  # Tell the caller the document has no usable text
    return combined

In [None]:
# Function to summarize text using GPT-4
def summarize_with_gpt4(text, model="gpt-4-turbo", temperature=0.3, max_tokens=500):
    """Summarize one interview transcript with an OpenAI chat model.

    Sends ``text`` as the user message, preceded by a fixed system instruction
    asking for a concise summary, through the module-level ``client``.

    Args:
        text: Full interview text to summarize.
        model: Chat model name. Defaults to "gpt-4-turbo" (the original
            hard-coded choice, picked for cost efficiency).
        temperature: Sampling temperature; the low default keeps summaries
            consistent between runs.
        max_tokens: Upper bound on the length of the generated summary.

    Returns:
        The content of the first returned choice. If the request raises for
        any reason, the error is printed and the literal string
        "Error in summarization" is returned instead of raising, so a batch
        run can continue; callers should treat that string as a failure marker.
    """
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "Summarize the following interview concisely, highlighting key points."},
                {"role": "user", "content": text}
            ],
            temperature=temperature,
            max_tokens=max_tokens,
        )
        return response.choices[0].message.content  # Text of the first (only requested) choice
    except Exception as e:
        # Deliberately best-effort: one failed request must not abort the whole batch
        print(f"⚠️ GPT-4 Error: {e}")
        return "Error in summarization"


In [None]:
# Function to analyze sentiment of the summary
def analyze_sentiment(text):
    """Classify the sentiment of ``text`` with the module-level ``sentiment_analyzer``.

    Args:
        text: The text to classify (in this notebook, an interview summary).

    Returns:
        The ``"label"`` field of the first prediction returned by the
        pipeline, or ``"Unknown"`` if the pipeline raises (the error is
        printed, not re-raised).
    """
    try:
        # Truncate long inputs explicitly. Without this the model raised a tensor-size
        # error on an over-long summary (523 vs. 514 positions in the recorded run) and
        # the file was silently recorded as "Unknown". max_length is given explicitly
        # because truncation=True alone only helps when the tokenizer declares a limit.
        prediction = sentiment_analyzer(text, truncation=True, max_length=512)
        return prediction[0]["label"]
    except Exception as e:
        # Best-effort: report and fall back so one bad input does not stop the batch
        print(f"⚠️ Sentiment analysis error: {e}")
        return "Unknown"

In [None]:
# Function to process all interviews with a progress bar
def analyze_interviews(folder_path):
    """Summarize and sentiment-score every Word document in a folder.

    For each ``.docx`` file directly inside ``folder_path`` the text is
    extracted, summarized with ``summarize_with_gpt4`` and the *summary* (not
    the full transcript) is classified with ``analyze_sentiment``. Progress is
    shown with a tqdm bar.

    Files are skipped (with a printed warning) when they contain no text or
    cannot be opened, so a single bad file does not abort the whole run.

    Args:
        folder_path: Directory containing the interview ``.docx`` files.

    Returns:
        A ``pandas.DataFrame`` with one row per processed file and the columns
        ``Interview_File``, ``Summary`` and ``Sentiment`` (present even when
        no file was processed).
    """
    # Sorted so the row order is reproducible (os.listdir order is arbitrary).
    # "~$..." entries are lock files Word leaves beside open documents; they end
    # in .docx but are not readable documents. The extension match ignores case.
    files = sorted(
        f for f in os.listdir(folder_path)
        if f.lower().endswith(".docx") and not f.startswith("~$")
    )

    results = []
    for filename in tqdm(files, desc="Processing Interviews", unit="file"):
        file_path = os.path.join(folder_path, filename)

        try:
            text = extract_text_from_docx(file_path)
        except Exception as e:
            # A corrupt or non-Word file should cost one row, not the whole batch
            print(f"⚠️ Skipping unreadable document: {filename} ({e})")
            continue

        if text is None:
            print(f"⚠️ Skipping empty document: {filename}")
            continue

        # Summarize using GPT-4
        summary = summarize_with_gpt4(text)

        if summary == "Error in summarization":
            # summarize_with_gpt4 returns this placeholder on failure; scoring the
            # placeholder would record a meaningless sentiment for the interview
            sentiment = "Unknown"
        else:
            # Perform sentiment analysis on the summary
            sentiment = analyze_sentiment(summary)

        results.append({
            "Interview_File": filename,
            "Summary": summary,
            "Sentiment": sentiment
        })

    # Explicit columns keep the schema stable even when results is empty
    return pd.DataFrame(results, columns=["Interview_File", "Summary", "Sentiment"])

In [None]:
# Process every interview found in the configured Drive folder
df = analyze_interviews(folder_path=interviews_folder)

# Write the per-interview results to a CSV on Drive (no index column)
output_path = "/content/drive/MyDrive/alaska_vaccine_project/interview_sentiments.csv"
df.to_csv(path_or_buf=output_path, index=False)
print(f"✅ Analysis complete. Results saved to: {output_path}")

# Preview the first rows as the cell's displayed output
df.head()

Processing Interviews:  66%|██████▌   | 57/87 [10:48<06:39, 13.32s/file]

⚠️ Sentiment analysis error: The expanded size of the tensor (523) must match the existing size (514) at non-singleton dimension 1.  Target sizes: [1, 523].  Tensor sizes: [1, 514]


Processing Interviews: 100%|██████████| 87/87 [17:01<00:00, 11.74s/file]

✅ Analysis complete. Results saved to: /content/drive/MyDrive/alaska_vaccine_project/interview_sentiments.csv





Unnamed: 0,Interview_File,Summary,Sentiment
0,Anthony van Weel.docx,"Anthony van Weel, a 21-year-old software devel...",LABEL_1
1,Brenton.docx,"Brent Strickland, a 50-year-old who works at t...",LABEL_2
2,Bryan thomas.docx,"The interview with Bryan Thomas, a 50-year-old...",LABEL_1
3,Gloria.docx,The interview primarily discusses the impact o...,LABEL_1
4,Fadwa.docx,"In the interview, Fadwa, a 53-year-old student...",LABEL_1
