In [1]:
from google.colab import drive

# Mount Google Drive at /content/drive so later cells can read and write files there
drive.mount('/content/drive')

Mounted at /content/drive


# Kütüphanelerin Kurulumu

In [2]:
!pip install python-docx
!pip install spacy
!python -m spacy download en_core_web_sm

Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/244.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━[0m [32m143.4/244.3 kB[0m [31m4.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-docx
Successfully installed python-docx-1.1.2
Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m51.6 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load(

# PROJE KODLARI

In [19]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.chunk import ne_chunk
from nltk.tag import pos_tag
import re
import docx
from collections import defaultdict
import spacy
import en_core_web_sm
import os


def download_nltk_resources(resources=None):
    """Download the NLTK data packages this notebook relies on.

    By default ``nltk.download`` signals a failed download by returning
    ``False`` instead of raising, so the return value is checked in addition
    to catching exceptions. Every failure produces a warning on stdout.

    Args:
        resources: Optional iterable of NLTK package ids. When omitted, the
            default list below is used.

    Returns:
        list: Ids of the packages that could not be downloaded, in the order
        they were attempted (empty when everything succeeded).
    """
    if resources is None:
        resources = [
            'punkt',
            'averaged_perceptron_tagger',
            'maxent_ne_chunker',
            'words',
            'stopwords',
            'punkt_tab'
        ]

    failed = []
    for resource in resources:
        try:
            succeeded = nltk.download(resource, quiet=True)
        except Exception as e:
            # Best effort: one broken package must not stop the others.
            print(f"Warning: Could not download {resource}: {str(e)}")
            failed.append(resource)
            continue

        if not succeeded:
            print(f"Warning: Could not download {resource}")
            failed.append(resource)

    return failed


# Fetch the NLTK data packages when this cell runs, before the class below is defined
download_nltk_resources()

class TranscriptTransformer:
    def __init__(self):
        # NTLK datalarının indirilmesi
        nltk.download('punkt')
        nltk.download('averaged_perceptron_tagger')
        nltk.download('maxent_ne_chunker')
        nltk.download('words')
        nltk.download('stopwords')

        self.stop_words = set(stopwords.words('english'))
        self.nlp = spacy.load('en_core_web_sm')

        # Anahtar kelimelerin tanımalanması
        self.key_concepts = {
            'SRE': ['site reliability engineering', 'reliability', 'reliable', 'sre team', 'site reliability'],
            'SLA_SLO': ['service level agreement', 'service level objective', 'sla', 'slo', 'availability', 'reliability target'],
            'ERROR_BUDGETS': ['error budget', 'budget', 'availability target', 'reliability target'],
            'DEV_OPS': ['development team', 'operations team', 'dev team', 'ops team', 'developers', 'operators'],
            'MONITORING': ['monitoring', 'alerts', 'metrics', 'measurement', 'tracking', 'observability'],
            'AUTOMATION': ['automate', 'automation', 'automated', 'script', 'tooling'],
            'POSTMORTEM': ['postmortem', 'post-mortem', 'incident review', 'blameless', 'root cause']
        }

    def read_docx(self, file_path):

        doc = docx.Document(file_path)
        text = []
        for paragraph in doc.paragraphs:
            text.append(paragraph.text)
        return '\n'.join(text)

    def preprocess_text(self, text):
        """Normalise raw document text while keeping sentence punctuation.

        Keeps ASCII letters, digits, whitespace and the characters
        ``. , ! ? ( ) ' : / -``; everything else is removed. Runs of
        whitespace (including newlines) become a single space.

        Args:
            text: Raw text, e.g. as returned by ``read_docx``.

        Returns:
            str: The cleaned text with no leading or trailing whitespace.
        """
        # Map typographic punctuation to ASCII before filtering, so that
        # contractions ("don’t") and dash-joined words are not fused together.
        text = text.translate(str.maketrans({
            '\u2018': "'",    # left single quotation mark
            '\u2019': "'",    # right single quotation mark / apostrophe
            '\u2013': '-',    # en dash
            '\u2014': ' - ',  # em dash separates clauses, so keep the words apart
        }))

        # The hyphen is placed last so it is a literal. Written as ")-:" it
        # formed a range that also let '*', '+' and '/' through by accident;
        # '/' is now listed on purpose (e.g. "24/7").
        text = re.sub(r"[^a-zA-Z0-9\s.,!?()':/-]", '', text)

        # Collapse whitespace after filtering so removed symbols leave no gaps.
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def extract_key_topics(self, text):

        doc = self.nlp(text.lower())
        topic_mentions = defaultdict(int)

        # Anahtar kavramların geçme sıklığını say
        for concept, terms in self.key_concepts.items():
            for term in terms:
                topic_mentions[concept] += len(re.findall(r'\b' + term + r'\b', text.lower()))

        # Konuları sıklığa göre sırala
        sorted_topics = sorted(topic_mentions.items(), key=lambda x: x[1], reverse=True)
        return [topic for topic, count in sorted_topics if count > 0]

    def extract_learning_objectives(self, text):
        """Return the predefined learning objectives of the teaching guide.

        Args:
            text: Not used; the objectives are a fixed list.

        Returns:
            list: Six objective strings, in presentation order.
        """
        return list((
            "Understand the core principles of Site Reliability Engineering (SRE)",
            "Learn about SLA and SLO concepts and their importance",
            "Understand error budgets and their role in reliability",
            "Explore the relationship between Dev and Ops teams",
            "Learn about monitoring and automation practices",
            "Understand postmortem culture and blameless post-incident reviews",
        ))

    def extract_instructor_notes(self, section_text):
        #NLP kullanarak bölüm metninden eğitmen notları çıkar
        doc = self.nlp(section_text)

        # Bölüm uzunluğu ve karmaşıklığına göre süre tahmin et
        word_count = len(section_text.split())
        time = '15-20 dakika' if word_count > 500 else '10-15 dakika'

        # Cümle önemini kullanarak anahtar noktaları çıkar
        sentences = [sent.text.strip() for sent in doc.sents]
        key_points = []

        for sent in sentences:
            if any(pattern in sent.lower() for pattern in [
                'is important', 'key', 'critical', 'essential', 'fundamental',
                'means that', 'refers to', 'is defined as', 'plays a role'
            ]):
                # Clean and shorten the sentence to make it concise
                point = re.sub(r'^.*?(is|means|refers)', '', sent).strip()
                point = re.sub(r'[.!?]$', '', point).strip()
                if len(point.split()) <= 10 and point not in key_points:
                    key_points.append(point)

        # Yeterli anahtar nokta bulunamazsa
        if not key_points:
            key_points = [
                re.sub(r'[.!?]$', '', sent.strip())
                for sent in sentences[:3]
            ]

        key_points = key_points[:3]  # En önemli 3 noktayı al

        # Eylem kelimelerine göre aktiviteleri çıkar
        activities = []
        for sent in sentences:
            if any(word in sent.lower() for word in ['practice', 'exercise', 'analyze', 'review', 'discuss', 'examine']):
                activities.append(sent)

        activity = activities[0] if activities else "Group discussion on key concepts"

        # Soru kalıplarını kullanarak yaygın soruları oluştur
        doc_questions = [sent.text for sent in doc.sents if sent.text.strip().endswith('?')]
        common_questions = []

        for question in doc_questions:
            # Konularla ilgili yaygın soruları filtrele
            if any(word in question.lower() for word in ['how', 'what', 'why', 'when', 'which']):
                if len(question.split()) <= 12:  # Soruları kısa tut
                    common_questions.append(question)

        # Soru bulunamazsa
        if not common_questions:
            topic = section_text.split()[0] if section_text else "this topic"
            common_questions = [
                f"How can we implement {topic} effectively?",
                f"What are the main challenges in {topic}?"
            ]

        common_questions = common_questions[:2]

        return {
            'time': time,
            'key_points': key_points,
            'activities': activity,
            'common_questions': common_questions
        }

    def add_instructor_notes(self, section_name, section_content):

        notes = self.extract_instructor_notes(section_content)

        return f"""
        INSTRUCTOR NOTES FOR {section_name.upper()}:
        ----------------------------------------
        Time Allocation: {notes['time']}

        Key Points to Emphasize:
        {chr(10).join('- ' + point for point in notes['key_points'])}

        Suggested Activities:
        {notes['activities']}

        Common Questions to Prepare For:
        {chr(10).join('- ' + q for q in notes['common_questions'])}
        """

    def structure_content(self, text):

        sentences = sent_tokenize(text)
        structured_content = defaultdict(list)

        current_section = "Introduction"

        # Bölüm anahtar kelimelerini tanımla
        section_keywords = {
            'Introduction': ['introduction', 'background', 'overview'],
            'SRE Fundamentals': ['reliability', 'engineering', 'fundamental'],
            'SLA and SLO': ['sla', 'slo', 'service level', 'availability'],
            'Error Budgets': ['error budget', 'reliability target'],
            'Dev and Ops Collaboration': ['dev team', 'ops team', 'collaboration'],
            'Monitoring and Automation': ['monitoring', 'automate', 'automation'],
            'Postmortem and Culture': ['postmortem', 'blameless', 'culture'],
            'Case Studies': ['example', 'case', 'instance'],
            'Best Practices': ['practice', 'recommendation', 'approach'],
            'Summary': ['conclusion', 'summary', 'finally']
        }

        for sentence in sentences:
            sentence_lower = sentence.lower()

            # Bölümü anahtar kelimelerle belirle
            for section, keywords in section_keywords.items():
                if any(keyword in sentence_lower for keyword in keywords):
                    current_section = section
                    break

            structured_content[current_section].append(sentence)

        # Eğitmen notlarının eklenmesi gereken bölümü güncelle
        for section, sentences in structured_content.items():
            section_text = " ".join(sentences)
            structured_content[section] = {
                'content': sentences,
                'instructor_notes': self.extract_instructor_notes(section_text)
            }

        return structured_content

    def word_count_verification(self, text, min_words=3900):
        """Check that ``text`` contains at least ``min_words`` words.

        Words are whitespace-separated tokens. The outcome is also printed.

        Args:
            text: Text to measure.
            min_words: Minimum number of words required (default 3900).

        Returns:
            bool: True when the requirement is met, False otherwise.
        """
        count = len(text.split())
        if count < min_words:
            print(f"Warning: Text contains {count} words, minimum requirement is {min_words} words")
            return False
        print(f"Word count requirement met: {count} words")
        return True

    def generate_section_summary(self, section_name, content):

        # Eğer content bir liste ise, metne dönüştür
        if isinstance(content, list):
            text = ' '.join(content)
        else:
            text = content

        doc = self.nlp(text)

        # Önemli desenleri ve konumları kullanarak ana cümleleri çıkar
        important_sentences = []

        # Önemli bilgiyi gösteren desenler
        importance_markers = [
            'key', 'important', 'essential', 'fundamental', 'primary',
            'focus', 'main', 'critical', 'crucial', 'significant',
            'introduces', 'explains', 'describes', 'covers', 'addresses'
        ]

        # İlk cümleyi al, çoğunlukla ana konuyu içerir
        if len(list(doc.sents)) > 0:
            important_sentences.append(list(doc.sents)[0].text)


        for sent in doc.sents:
            sent_text = sent.text.lower()
            if any(marker in sent_text for marker in importance_markers):
                if sent.text not in important_sentences:
                    important_sentences.append(sent.text)


        topic_terms = []
        topic_name = section_name.lower()

        for concept, terms in self.key_concepts.items():
            if any(term in topic_name for term in terms):
                topic_terms.extend(terms)

        for sent in doc.sents:
            sent_text = sent.text.lower()
            term_count = sum(1 for term in topic_terms if term in sent_text)
            if term_count >= 2 and sent.text not in important_sentences:
                important_sentences.append(sent.text)


        important_sentences = important_sentences[:4]


        summary = f"""
        Summary: {important_sentences[0]}
        {' '.join(important_sentences[1:])}
        """


        summary = re.sub(r'\s+', ' ', summary)
        summary = summary.strip()

        return summary

    def generate_teaching_transcript(self, input_file):

        raw_text = self.read_docx(input_file)
        cleaned_text = self.preprocess_text(raw_text)


        key_topics = self.extract_key_topics(cleaned_text)
        learning_objectives = self.extract_learning_objectives(cleaned_text)
        structured_content = self.structure_content(cleaned_text)


        output = []

        #Başlık Oluuşturma
        output.append("SITE RELIABILITY ENGINEERING (SRE) - TEACHING GUIDE\n")

        # Learning Objectives
        output.append("LEARNING OBJECTIVES:")
        for i, objective in enumerate(learning_objectives, 1):
            output.append(f"{i}. {objective}")
        output.append("\n")

        # Key Topics
        output.append("KEY TOPICS COVERED:")
        for i, topic in enumerate(key_topics, 1):
            output.append(f"{i}. {topic.replace('_', ' ').title()}")
        output.append("\n")

        for section, data in structured_content.items():
            output.append(f"\n{section.upper()}:")


            output.append(self.add_instructor_notes(section, " ".join(data['content'])))


            output.append(self.generate_section_summary(section, data['content']))


            output.append(" ".join(data['content']))
            output.append("\n")

        final_text = "\n".join(output)

        # Verify word count
        if not self.word_count_verification(final_text):
            print("Warning: Generated transcript may be too short for a 30-minute lecture")

        return final_text

    def save_teaching_transcript(self, output_text, output_file):

        doc = docx.Document()


        title = doc.add_heading('Site Reliability Engineering (SRE) - Teaching Guide', 0)


        sections = output_text.split('\n\n')
        for section in sections:
            if section.strip():
                if section.endswith(':'):

                    doc.add_heading(section, level=1)
                else:

                    para = doc.add_paragraph()
                    para.add_run(section)

        doc.save(output_file)

def main(
    input_file="/content/drive/MyDrive/Case_Study_Omer_Guzeller/metin.docx",
    output_file="/content/drive/MyDrive/Case_Study_Omer_Guzeller/output/egitim_materyali_transformu.docx",
):
    """Generate the teaching guide from a transcript and save it as .docx.

    Progress and errors are reported with ``print``; processing errors are
    caught so that a failed run ends with diagnostics instead of a traceback.

    Args:
        input_file: Path of the source .docx transcript. Defaults to the
            Google Drive location used by this notebook.
        output_file: Path of the .docx file to write. Its parent directory
            is created when missing.
    """
    # Fail fast: check the input before loading models or creating folders
    if not os.path.exists(input_file):
        print(f"Error: Input file '{input_file}' not found at {os.path.abspath(input_file)}")
        return

    # exist_ok avoids the check-then-create race of a separate exists() test
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    transformer = TranscriptTransformer()

    try:
        # Read once up front only to reject empty documents early
        raw_text = transformer.read_docx(input_file)
        if not raw_text.strip():
            print("Error: Input file is empty or could not be read properly")
            return

        teaching_transcript = transformer.generate_teaching_transcript(input_file)
        transformer.save_teaching_transcript(teaching_transcript, output_file)
        print(f"Teaching transcript successfully generated and saved to {output_file}")

        word_count = len(teaching_transcript.split())
        print(f"Generated transcript contains {word_count} words")

    except Exception as e:
        # Deliberate catch-all: report what went wrong and where to look
        print(f"Error processing transcript: {str(e)}")
        print(f"Current working directory: {os.getcwd()}")
        print(f"Input file absolute path: {os.path.abspath(input_file)}")
        print("Please ensure the input file is a valid DOCX file and has readable content.")

# Run the pipeline only when this code is executed as the main module
if __name__ == "__main__":
    main()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Word count requirement met: 8167 words
Teaching transcript successfully generated and saved to /content/drive/MyDrive/Case_Study_Omer_Guzeller/output/egitim_materyali_transformu.docx
Generated transcript contains 8167 words
